diff --git a/.gitattributes b/.gitattributes
index 5a815654b4c..bede44edf8a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,4 +15,6 @@ windows/INSTALL* eol=native
 windows/NewGuidCmd.exe.config text eol=crlf
 windows/NewGuidCmd.exe binary
+# Prevent git from changing CR-LF to LF when archiving (patch requires CR-LF on Windows).
+**/*.patch -text
diff --git a/.gitignore b/.gitignore
index f80ffac482d..62c22459577 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,11 +6,12 @@
 !/src/*/Makefile
 !/src/*/README
-# Compiled Object files
+# Compiled Object files and Python files
 *.slo
 *.lo
 *.o
 *.obj
+*.pyc
 # Compiled Dynamic libraries
 *.so
diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh
index 9bc2b3195ef..5ec5d4b715f 100644
--- a/egs/ami/s5/cmd.sh
+++ b/egs/ami/s5/cmd.sh
@@ -1,9 +1,24 @@
-# "queue.pl" uses qsub. The options to it are
-# options to qsub. If you have GridEngine installed,
-# change this to a queue you have access to.
-# Otherwise, use "run.pl", which will run jobs locally
-# (make sure your --num-jobs options are no more than
-# the number of cpus on your machine.
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 1G"
+export decode_cmd="queue.pl --mem 2G"
+# the use of cuda_cmd is deprecated but it is sometimes still used in nnet1
+# scripts.
+export cuda_cmd="queue.pl --gpu 1 --mem 20G"
+
+# the rest of this file is present for historical reasons.
+# In general it's best to rely on conf/queue.conf for cluster-specific
+# configuration.
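# A minimal sketch (not part of the patch) of how these exported variables are
# consumed. utils/queue.pl, utils/run.pl and utils/slurm.pl share one calling
# convention, so running everything on a single local machine is just:
#
#   export train_cmd="run.pl"
#   export decode_cmd="run.pl"
#
# and a typical parallel job launched through whichever wrapper is chosen looks
# like the following (the log and scp paths here are illustrative only):
#
#   $train_cmd JOB=1:4 exp/make_mfcc/train/make_mfcc.JOB.log \
#     compute-mfcc-feats scp:data/train/split4/JOB/wav.scp \
#       ark:mfcc/raw_mfcc_train.JOB.ark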
# On Eddie use: #export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" @@ -11,27 +26,13 @@ #export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" #export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" -# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay) -export train_cmd="queue.pl -l arch=*64* --mem 1G" -export decode_cmd="queue.pl -l arch=*64* --mem 2G" -export highmem_cmd="queue.pl -l arch=*64* --mem 4G" -export scoring_cmd="queue.pl -l arch=*64*" -export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G" -export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" - -# To run locally, use: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export highmem_cmd=run.pl -#export cuda_cmd=run.pl - if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/ami/s5/local/nnet3/run_lstm.sh b/egs/ami/s5/local/nnet3/run_lstm.sh index 29ebf6ca601..d077d14cc1e 100755 --- a/egs/ami/s5/local/nnet3/run_lstm.sh +++ b/egs/ami/s5/local/nnet3/run_lstm.sh @@ -18,7 +18,7 @@ stage=0 train_stage=-10 mic=ihm -use_ihm_ali=false +use_ihm_ali=false use_sat_alignments=false # if true, use tri4a alignments are used # by default GMM-HMM systems are not built to this stage # in SDM and MDM systems. So run the tri4a stage if you @@ -66,7 +66,7 @@ decode_iter= echo "$0 $@" # Print the command line for logging -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index b4d41d7066a..b9d60d78182 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -10,13 +10,13 @@ mic=ihm stage=0 . 
utils/parse_options.sh -# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -AMI_DIR=$PWD/wav_db # Default, -case $(hostname -d) in +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, @@ -86,7 +86,7 @@ if [ $stage -le 5 ]; then data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali # Decode, graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} @@ -104,26 +104,26 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali # Decode, graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi +fi if [ $stage -le 7 ]; then # Train tri4a, which is LDA+MLLT+SAT, steps/train_sat.sh --cmd "$train_cmd" \ 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri4a - # Decode, + # Decode, graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} fi nj_mmi=80 @@ -160,11 +160,11 @@ if [ $stage -le 11 ]; then decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_dev_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_dev_${LM} --iter $i \ - $graph_dir data/$mic/dev $decode_dir + $graph_dir data/$mic/dev $decode_dir decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_eval_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_eval_${LM} --iter $i \ - $graph_dir data/$mic/eval $decode_dir + $graph_dir data/$mic/eval $decode_dir done fi @@ -181,7 +181,7 @@ if [ $stage -le 13 ]; then --hidden-dim 950 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments true - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri4a \ diff --git a/egs/aspire/s5/local/nnet3/run_autoencoder.sh b/egs/aspire/s5/local/nnet3/run_autoencoder.sh new file mode 100644 index 00000000000..abc7f3a6234 --- /dev/null +++ b/egs/aspire/s5/local/nnet3/run_autoencoder.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# this is an 
example to show a "tdnn" system in raw nnet configuration +# i.e. without a transition model + +. cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +num_data_reps=10 + +remove_egs=true + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $targets_scp +done + +if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + + num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ + --feat-dir ${data_dir} \ + --relu-dim=1024 \ + --add-lda=false \ + --objective-type=quadratic \ + --add-final-sigmoid=false \ + --include-log-softmax=false \ + --use-presoftmax-prior-scale=false \ + --num-targets=$num_targets \ + $dir/configs || exit 1; +fi + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/tdnn/train_raw_nnet.sh --stage $train_stage \ + --cmd "$decode_cmd" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --num-epochs 2 \ + --num-jobs-initial 3 \ + --num-jobs-final 16 \ + --initial-effective-lrate 0.0017 \ + --final-effective-lrate 0.00017 \ + --egs-dir "$common_egs_dir" \ + --remove-egs $remove_egs \ + --use-gpu true \ + --dense-targets true \ + ${data_dir} $targets_scp $dir || exit 1 +fi + diff --git a/egs/aurora4/s5/cmd.sh b/egs/aurora4/s5/cmd.sh index 139b2cd6c6c..378febca15b 100644 --- a/egs/aurora4/s5/cmd.sh +++ b/egs/aurora4/s5/cmd.sh @@ -1,29 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still used in some example scripts +# here. 
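# A rough sketch (not part of the patch) of what a conf/queue.conf for a
# GridEngine cluster can look like; it follows the 'default_config' format
# described in utils/queue.pl, and the queue/resource names below are
# assumptions that must be adapted to the local cluster:
#
#   command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
#   option mem=* -l mem_free=$0,ram_free=$0
#   option mem=0
#   option num_threads=* -pe smp $0
#   option num_threads=1
#   option gpu=0
#   option gpu=* -l gpu=$0 -q g.q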
export cuda_cmd="queue.pl --gpu 1" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/babel/s5/cmd.sh b/egs/babel/s5/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5/cmd.sh +++ b/egs/babel/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5b/cmd.sh b/egs/babel/s5b/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/babel/s5b/cmd.sh +++ b/egs/babel/s5b/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5c/cmd.sh b/egs/babel/s5c/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5c/cmd.sh +++ b/egs/babel/s5c/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/bn_music_speech/v1/cmd.sh b/egs/bn_music_speech/v1/cmd.sh index 27d1d36a6a6..d1ca1a6d126 100755 --- a/egs/bn_music_speech/v1/cmd.sh +++ b/egs/bn_music_speech/v1/cmd.sh @@ -1,17 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" - -#c) run it locally... -#export train_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/callhome_egyptian/s5/cmd.sh b/egs/callhome_egyptian/s5/cmd.sh index ab29f13d4cc..71dd849a93b 100755 --- a/egs/callhome_egyptian/s5/cmd.sh +++ b/egs/callhome_egyptian/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/chime1/s5/cmd.sh b/egs/chime1/s5/cmd.sh index dda6226f419..0dcd5a9200f 100755 --- a/egs/chime1/s5/cmd.sh +++ b/egs/chime1/s5/cmd.sh @@ -1,39 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - - -#c) USFD cluster options -#config="conf/queue_usfd.conf" -#export train_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export decode_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export mkgraph_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export cuda_cmd="queue.pl --config $config --mem 24G --rmem 20G --gpu 1 --time 24:00:00" - - -#d) run it locally... -export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" + +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime2/s5/cmd.sh b/egs/chime2/s5/cmd.sh index 8bb00fe0ec6..0dcd5a9200f 100644 --- a/egs/chime2/s5/cmd.sh +++ b/egs/chime2/s5/cmd.sh @@ -1,30 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G" -export cuda_cmd="queue.pl -l gpu=1" -#export cuda_cmd="..." +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/csj/s5/cmd.sh b/egs/csj/s5/cmd.sh index d5952fe0f87..71dd849a93b 100644 --- a/egs/csj/s5/cmd.sh +++ b/egs/csj/s5/cmd.sh @@ -1,31 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64*" -#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export train_cmd="run.pl" -export decode_cmd="run.pl" -#export cuda_cmd="..." -#export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export mkgraph_cmd="run.pl" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/farsdat/s5/cmd.sh b/egs/farsdat/s5/cmd.sh index d749f2c9f1f..71dd849a93b 100644 --- a/egs/farsdat/s5/cmd.sh +++ b/egs/farsdat/s5/cmd.sh @@ -1,25 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export cuda_cmd="run.pl" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=2500M,mem_free=2500M,matylda5=0.5" -#export decode_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=3000M,mem_free=3000M,matylda5=0.1" -#export mkgraph_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=4G,mem_free=4G,matylda5=3" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu*,long.q@pco203-0[0124] -l gpu=1" - -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index ab29f13d4cc..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index f453ab42058..8fe80b46784 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -1,13 +1,13 @@ #!/bin/bash # # Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. # To be run from one directory above this script. # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the +# id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) @@ -18,8 +18,8 @@ export LC_ALL=C if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories, se -e ../run.sh for example." 
+ echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" exit 1; fi @@ -72,20 +72,20 @@ fi speech_d1=$dir/links/LDC2010S01/DISC1/data/speech speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts #Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -105,7 +105,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -149,7 +149,7 @@ if [ $stage -le 3 ]; then for f in `cat $tmpdir/train_sph.flist`; do # convert to absolute path readlink -e $f - done > $tmpdir/train_sph_abs.flist + done > $tmpdir/train_sph_abs.flist cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 0f2bd037ba0..6d04f53c7e5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,12 +22,32 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" ]; then - # Merge with gigaword corpus - $local/merge_lexicons.py - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords + if [ ! -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! 
-e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xovzf es_wordlist.json.tgz || exit 1; + cd $cwd fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords fi #Then get the list of phones form basic_rules in the lexicon folder @@ -50,6 +70,7 @@ if [ $stage -le 2 ]; then # representation cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ > $tmpdir/lexicon_raw fi diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 8c67ae56804..5c09f09bc35 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -7,55 +7,58 @@ import sys import json import codecs -import os import operator -wordlimit=64000 -uw_fisher="data/local/tmp/uniquewords" -uw_gigaword="/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" -uw_LDC="/export/corpora/LDC/LDC96L16/callhome_spanish_lexicon_970908/preferences" +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary fisher = codecs.open(uw_fisher, encoding='utf-8') for line in fisher: - merged_lexicon.append(line.strip()) + merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the fisher data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the LDC data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print "After adding the Gigaword data, the lexicon contains " + str(len(merged_lexicon)) + " entries." + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print "After adding the Gigaword data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." 
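# With the arguments introduced above, the script is no longer tied to
# hard-coded /export paths; it is invoked by local/fsp_prepare_dict.sh roughly
# as follows (paths are illustrative, taken from this recipe's defaults):
#
#   local/merge_lexicons.py data/local/tmp /export/corpora/LDC/LDC96L16
#
# i.e. first the tmp directory holding uniquewords and es_wordlist.json, then
# the root of the LDC96L16 CALLHOME Spanish lexicon.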
# Now write the uniquewords to a file -lf = codecs.open('data/local/tmp/uniquewords64k', encoding='utf-8', mode='w+') +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + lf.write(item + "\n") lf.close() print "Finshed writing unique words" - diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 706f3793278..edd7f56bad2 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -17,12 +17,10 @@ set -e sfisher_speech=/home/mpost/data/LDC/LDC2010S01 sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 -#split=/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt split=local/splits/split_fisher callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 -#split_callhome=/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome split=local/splits/split_callhome local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts @@ -33,16 +31,16 @@ local/fsp_prepare_dict.sh $spanish_lexicon # Rewrite ----------------------------- This section is no longer needed---- # At this point, it might make sense to use a bigger lexicon -# The one I will use is derived from this exercise (spanish fisher) and -# the LDC spanish lexicon along with the most frequent words derived from the +# The one I will use is derived from this exercise (spanish fisher) and +# the LDC spanish lexicon along with the most frequent words derived from the # gigaword corpus such that the total number of entries in the lexicon # are 64k # To generate the merged lexicon, run # /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py # you might have to set the locations of the three lexicons within this -# file. Note that the LDC rule base phoneme generator works only from its -# own directory. So the merged lexicon is actually created in +# file. Note that the LDC rule base phoneme generator works only from its +# own directory. So the merged lexicon is actually created in # /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k # This can be easily fixed and will be done. #TODO # Also run the clean lexicon script to take care of non stressable vowels @@ -57,11 +55,11 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl -# to get the numbers. Depending on your needs, you might have to change the size of -# the splits within that file. The default paritions are based on the Kaldi + Joshua +# to get the numbers. Depending on your needs, you might have to change the size of +# the splits within that file. 
The default paritions are based on the Kaldi + Joshua # requirements which means that I have very large dev and test sets local/fsp_train_lms.sh $split local/fsp_create_test_lang.sh @@ -95,7 +93,7 @@ cp -r data/local/data/callhome_train_all data/callhome_train_all # MT Tune : Same as the ASR eval set (Use the lattices from here) # MT Eval : 20k utterances # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker -# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. +# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. # As noted above, the LM has not been trained on the dev and the test sets. #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test @@ -136,7 +134,7 @@ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k -utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train_10k_nodup data/lang exp/mono0a @@ -178,7 +176,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/cmd.sh b/egs/fisher_swbd/s5/cmd.sh index e3294fde05a..88db78823a5 100644 --- a/egs/fisher_swbd/s5/cmd.sh +++ b/egs/fisher_swbd/s5/cmd.sh @@ -1,32 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - -#d) Gorgon cluster -#export train_cmd="gorgon_queue.pl -q gorgon" -#export decode_cmd="gorgon_queue.pl -q gorgon" -#export cuda_cmd="gorgon_queue.pl -q gorgon" -#export mkgraph_cmd="gorgon_queue.pl -q gorgon" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
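# For comparison, a sketch (not part of the patch) of the same file written for
# a Slurm cluster; the memory values are simply copied from the GridEngine
# version below and would need tuning:
#
#   export train_cmd="slurm.pl --mem 4G"
#   export decode_cmd="slurm.pl --mem 4G"
#   export mkgraph_cmd="slurm.pl --mem 8G"
#
# utils/slurm.pl takes the same --mem/--gpu style options and, like queue.pl,
# contains a 'default_config' string that can be copied into conf/ and edited.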
+ +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/conf/MSU_single_letter.txt b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt new file mode 100644 index 00000000000..1f7b419cca7 --- /dev/null +++ b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt @@ -0,0 +1,26 @@ +A ey +B b iy +C s iy +D d iy +E iy +F eh f +G jh iy +H ey ch +I ay +J jh ey +K k ey +L eh l +M eh m +N eh n +O ow +P p iy +Q k y uw +R aa r +S eh s +T t iy +U y uw +V v iy +W d ah b ax l y uw +X eh k s +Y w ay +Z z iy diff --git a/egs/fisher_swbd/s5/local/dict.patch b/egs/fisher_swbd/s5/local/dict.patch new file mode 100644 index 00000000000..7fcaa98b4f5 --- /dev/null +++ b/egs/fisher_swbd/s5/local/dict.patch @@ -0,0 +1,378 @@ +8645a8646 +> uh-hum ah m hh ah m +9006c9007 +< April ey p r ih l +--- +> April ey p r ax l +9144d9144 +< B ay zh aa n iy z +9261c9261 +< Battle b ae t el +--- +> Battle b ae t ax l +10014a10015 +> Chevy sh eh v iy +10211a10213 +> Colorado k ao l ax r aa d ow +10212a10215 +> Colorado' k ao l ax r aa d ow z +10370c10373 +< Creek k r ih k +--- +> Creek k r iy k +10889a10893 +> Eleven ax l eh v ih n +10951c10955 +< Erie ih r iy +--- +> Erie iy r iy +11183c11187 +< Forever f ax r eh v er +--- +> Forever f er eh v er +11231a11236 +> Friday f r ay d iy +11744a11750 +> History hh ih s t r iy +12004a12011,12012 +> Israel ih z r ih l +> Israel's ih z r ih l z +12573a12582 +> Lincoln l ih ng k ih n +12574a12584 +> Lincolns l ih ng k ih n z +13268c13278 +< NAACP eh ey ey s iy p iy +--- +> NAACP eh n ey ey s iy p iy +13286c13296 +< NIT eh ay t iy +--- +> NIT eh n ay t iy +13292c13302 +< NTSC eh t iy eh s s iy +--- +> NTSC eh n t iy eh s s iy +14058a14069 +> Quarter k ow r t er +14059a14071 +> Quarterback k ow r t er b ae k +14060a14073 +> Quarters k ow r t er z +14569a14583 +> Science s ay n s +15087a15102 +> Sunday s ah n d iy +15088a15104 +> Sunday's s ah n d iy z +15089a15106 +> Sundays s ah n d iy z +15290,15291c15307,15308 +< Texan t eh k sh ih n +< Texan's t eh k sh ih n s +--- +> Texan t eh k s ih n +> Texan's t eh k s ih n s +15335a15353 +> Thousands th aw z ih n z +15739c15757 +< Waco w ae k ow +--- +> Waco w ey k ow +15841a15860 +> Weekends w iy k eh n z +16782a16802 +> acceptable eh k s eh p ax b ax l +16833a16854 +> accounting ax k aw n ih ng +16948a16970 +> address ax d r eh s +17281a17304 +> already aa r d iy +17315a17339 +> am m +17709a17734 +> asked ae s t +17847a17873 +> attorney ih t er n iy +17919a17946 +> autopilot ao t ow p ay l ih t +17960a17988 +> awfully ao f l iy +18221a18250 +> basketball b ae s k ax b ao l +18222a18252 +> basketball's b ae s k ax b ao l z +18302a18333 +> become b ah k ah m +18303a18335 +> becomes b iy k ah m z +18344a18377 +> began b ax g en n +18817c18850 +< bottle b aa t el +--- +> bottle b aa t ax l +19332,19333c19365,19367 +< camera's k ae m ax r ax z +< cameras k ae m ax r ax z +--- +> camera k ae m r ax +> camera's k ae m r ax z +> cameras k ae m r ax z +19411a19446 +> capital k ae p ax l +19505a19541 +> carrying k ae r ih ng +20316a20353,20354 +> combination k aa m ih n ey sh ih n +> combinations k aa m ih n ey sh ih n z +20831a20870 +> contracts k aa n t r ae k s +21010a21050 +> costs k ao s +21062a21103 +> county k aw n iy +21371a21413 +> cultural k ao l ch ax r ax l +21372a21415 +> culturally k ao l ch ax r ax l iy +21373a21417 +> culture k ao l ch er +21375a21420 +> cultures k ao l ch er z +21543a21589 +> data d ey t ax +22097a22144 +> differently d ih f ax r ih n t l 
iy +22972a23020 +> effects ax f eh k t s +23016a23065 +> election ax l eh k sh ih n +23018a23068 +> elections ax l eh k sh ih n z +23052a23103 +> eleven ax l eh v ih n +23242a23294 +> enjoyable ae n jh oy ax b ax l +23248a23301 +> enjoys ae n jh oy z +23293a23347 +> entire ih n t ay r +23295a23350,23351 +> entirely ih n t ay r l iy +> entirety ih n t ay r t iy +23745a23802 +> extra eh k s t er +23818a23876 +> facts f ae k s +24508c24566 +< forever f ax r eh v er +--- +> forever f er eh v er +24514c24572 +< forget f ow r g eh t +--- +> forget f er r g eh t +24521a24580 +> forgot f er r g aa t +24522a24582 +> forgotten f er r g aa t ax n +24563a24624 +> forward f ow er d +24680a24742 +> frightening f r ay t n ih ng +24742a24805 +> full-time f ax l t ay m +24862a24926 +> garage g r aa jh +25218a25283 +> grandmother g r ae m ah dh er +25790a25856 +> heavily hh eh v ax l iy +25949a26016 +> history hh ih s t r iy +26038a26106 +> honestly aa n ax s t l iy +26039a26108 +> honesty aa n ax s t iy +26099a26169 +> horror hh ow r +26155a26226 +> houses hh aw z ih z +26184c26255 +< huh-uh hh ah hh ah +--- +> huh-uh ah hh ah +26189c26260 +< hum-um hh m hh m +--- +> hum-um ah m hh ah m +26236a26308 +> hunting hh ah n ih ng +26307a26380,26381 +> ideal ay d iy l +> idealist ay d iy l ih s t +26369a26444 +> imagine m ae jh ih n +26628a26704 +> individuals ih n d ih v ih jh ax l z +26968a27045 +> interest ih n t r ih s t +27184a27262 +> it'd ih d +27702a27781 +> lead l iy d +28378a28458 +> mandatory m ae n d ih t ow r iy +28885a28966 +> minute m ih n ih t +29167a29249 +> mountains m aw t n z +29317a29400 +> mysteries m ih s t r iy z +29318a29402 +> mystery m ih s t r iy +29470a29555 +> nervous n er v ih s +29578,29580c29663,29665 +< nobody n ow b aa d iy +< nobody'll n ow b aa d iy l +< nobody's n ow b aa d iy z +--- +> nobody n ow b ah d iy +> nobody'll n ow b ah d iy l +> nobody's n ow b ah d iy z +29712a29798 +> nuclear n uw k l iy r +29938a30025 +> onto aa n t ax +30051a30139 +> originally ax r ih jh ax l iy +30507a30596 +> particularly p er t ih k y ax l iy +30755a30845 +> perfectly p er f ih k l iy +30820a30911 +> personally p er s n ax l iy +30915a31007 +> physically f ih z ih k l iy +30986a31079 +> pilot p ay l ih t +30987a31081 +> pilot's p ay l ih t s +31227a31322 +> police p l iy s +31513a31609 +> prefer p er f er +31553a31650 +> prepare p r ax p ey r +31578a31676 +> prescription p er s k r ih p sh ih n +31579a31678 +> prescriptions p er s k r ih p sh ih n z +31770a31870 +> products p r aa d ax k s +31821a31922 +> projects p r aa jh eh k s +31908a32010 +> protect p er t eh k t +31909a32012 +> protected p er t eh k t ih d +31911a32015 +> protection p er t eh k sh ih n +31914a32019 +> protection p er t eh k t ih v +32149a32255 +> quarter k ow r t er +32414a32521 +> read r iy d +32785a32893 +> rehabilitation r iy ax b ih l ih t ey sh ih n +33150a33259 +> resource r ih s ow r s +33151a33261 +> resources r iy s ow r s ih z +33539c33649 +< roots r uh t s +--- +> roots r uw t s +33929a34040 +> science s ay n s +34315a34427 +> seventy s eh v ih n iy +34319,34320c34431,34432 +< severe s ax v iy r +< severely s ax v iy r l iy +--- +> severe s ih v iy r +> severely s ih v iy r l iy +35060a35173 +> software s ao f w ey r +35083a35197 +> solid s ao l ih d +35084a35199 +> solidly s ao l ih d l iy +35750a35866 +> stood s t ih d +35854a35971 +> strictly s t r ih k l iy +35889c36006 +< stronger s t r ao ng er +--- +> stronger s t r ao ng g er +36192a36310,36311 +> supposed s p ow z +> supposed s p ow s +36510a36630 
+> tastes t ey s +36856a36977 +> thoroughly th er r l iy +36866a36988 +> thousands th aw z ih n z +37081c37203 +< toots t uh t s +--- +> toots t uw t s +37157a37280 +> toward t w ow r d +37158a37282 +> towards t w ow r d z +37564a37689 +> twenties t w eh n iy z +37565a37691 +> twentieth t w eh n iy ih th +37637a37764 +> unacceptable ah n ae k s eh p ax b ax l +37728a37856 +> understand ah n d er s t ae n +37860a37989 +> unless ih n l eh s +38040a38170 +> use y uw z +38049a38180 +> uses y uw z ih z +38125a38257 +> various v ah r iy ih s +38202a38335 +> versus v er s ih z +38381c38514 +< wacko w ae k ow +--- +> wacko w ey k ow +38455c38588 +< wanna w aa n ax +--- +> wanna w ah n ax +38675c38808 +< whatnot w ah t n aa t +--- +> whatnot w aa t n aa t +38676a38810 +> whatsoever w aa t s ow eh v er +38890c39024 +< wok w aa k +--- +> wok w ao k +38910a39045 +> wondering w ah n d r ih ng diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms.sh b/egs/fisher_swbd/s5/local/fisher_train_lms.sh index 5d8b9e2e18d..a9e3fa4566a 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh index ebc954b756b..3133af6ee1f 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/swbd1_data_download.sh b/egs/fisher_swbd/s5/local/swbd1_data_download.sh new file mode 100755 index 00000000000..95c9d5e58a4 --- /dev/null +++ b/egs/fisher_swbd/s5/local/swbd1_data_download.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Switchboard-1 training data preparation customized for Edinburgh +# Author: Arnab Ghoshal (Jan 2013) + +# To be run from one directory above this script. + +## The input is some directory containing the switchboard-1 release 2 +## corpus (LDC97S62). Note: we don't make many assumptions about how +## you unpacked this. We are just doing a "find" command to locate +## the .sph files. + +. path.sh + +#check existing directories +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" + exit 1; +fi + +SWBD_DIR=$1 + +dir=data/local/train_swbd +mkdir -p $dir + +# Audio data directory check +if [ ! -d $SWBD_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# Trans directory check +if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then + ( + cd $dir; + if [ ! 
-d swb_ms98_transcriptions ]; then + echo " *** Downloading transcriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || + wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz + tar -xf switchboard_word_alignments.tar.gz + fi + ) +else + echo "Directory with transcriptions exists, skipping downloading" + [ -f $dir/swb_ms98_transcriptions ] \ + || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ +fi diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 552e304a6a3..54513437dbe 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -14,7 +14,7 @@ #check existing directories if [ $# != 1 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD" exit 1; fi @@ -23,7 +23,6 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi @@ -34,22 +33,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - # To get the SWBD transcriptions and dict, do: - echo " *** Downloading transcriptions and dictionary ***" - ( - cd $dir; - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ !
-f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; @@ -101,7 +84,7 @@ local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final trans # format acronyms in text python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict/acronyms_swbd.map + -M data/local/dict_nosp/acronyms_swbd.map cp $dir/text $dir/text_bk mv $dir/text_map $dir/text diff --git a/egs/fisher_swbd/s5/run.sh b/egs/fisher_swbd/s5/run.sh index 4bb0a55b0a9..fa3ad62fa84 100755 --- a/egs/fisher_swbd/s5/run.sh +++ b/egs/fisher_swbd/s5/run.sh @@ -25,7 +25,6 @@ local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 # local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1 - utils/prepare_lang.sh data/local/dict_nosp \ "" data/local/lang_nosp data/lang_nosp @@ -135,15 +134,14 @@ local/remove_dup_utts.sh 300 data/train data/train_nodup ) # Start training on the Switchboard subset, which has cleaner alignments - steps/train_mono.sh --nj 3 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang_nopp exp/mono0a + data/train_10k_nodup data/lang_nosp exp/mono0a steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_30k_nodup data/lang_nopp exp/mono0a exp/mono0a_ali || exit 1; + data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; steps/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup data/lang_nopp exp/mono0a_ali exp/tri1a || exit 1; + 3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1; #used to be 2500 20000 ( graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg diff --git a/egs/gale_arabic/s5/cmd.sh b/egs/gale_arabic/s5/cmd.sh index 6e2777b595b..71dd849a93b 100755 --- a/egs/gale_arabic/s5/cmd.sh +++ b/egs/gale_arabic/s5/cmd.sh @@ -1,11 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/gale_mandarin/s5/cmd.sh b/egs/gale_mandarin/s5/cmd.sh index 6e2777b595b..2d51ad82004 100755 --- a/egs/gale_mandarin/s5/cmd.sh +++ b/egs/gale_mandarin/s5/cmd.sh @@ -1,11 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. 
If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated, but it's still used in this example +# directory. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/hkust/s5/cmd.sh b/egs/hkust/s5/cmd.sh index 2a46d89f385..71dd849a93b 100644 --- a/egs/hkust/s5/cmd.sh +++ b/egs/hkust/s5/cmd.sh @@ -1,13 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export train_cmd=run.pl -#export decode_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/cmd.sh b/egs/librispeech/s5/cmd.sh index 6395d96ca36..71dd849a93b 100644 --- a/egs/librispeech/s5/cmd.sh +++ b/egs/librispeech/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/run.sh b/egs/librispeech/s5/run.sh index 1cb03b04ffe..4a542cc30c0 100755 --- a/egs/librispeech/s5/run.sh +++ b/egs/librispeech/s5/run.sh @@ -2,7 +2,7 @@ # Set this to somewhere where you want to put your data, or where -# someone else has already put it. You'll want to change this +# someone else has already put it. You'll want to change this # if you're not on the CLSP grid. data=/export/a15/vpanayotov/data @@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data data_url=www.openslr.org/resources/12 lm_url=www.openslr.org/resources/11 -. cmd.sh -. path.sh +. ./cmd.sh +. ./path.sh # you might not want to do this for interactive shells. set -e @@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do done # download the LM resources -local/download_lm.sh $lm_url data/local/lm || exit 1 +local/download_lm.sh $lm_url data/local/lm # format the data as Kaldi data directories for part in dev-clean test-clean dev-other test-other train-clean-100; do # use underscore-separated names in data directories. - local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1 + local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) done ## Optional text corpus normalization and LM training @@ -39,7 +39,7 @@ done ## well as some intermediate data(e.g. the normalized text used for LM training), ## are available for download at http://www.openslr.org/11/ #local/lm/train_lm.sh $LM_CORPUS_ROOT \ -# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1 +# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm ## Optional G2P training scripts. 
## As the LM training scripts above, this script is intended primarily to @@ -49,24 +49,24 @@ done # when "--stage 3" option is used below we skip the G2P steps, and use the # lexicon we have already downloaded from openslr.org/11/ local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \ - data/local/lm data/local/lm data/local/dict_nosp || exit 1 + data/local/lm data/local/lm data/local/dict_nosp utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + "" data/local/lang_tmp_nosp data/lang_nosp -local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1 +local/format_lms.sh --src-dir data/lang_nosp data/local/lm # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_tglarge || exit 1; + data/lang_nosp data/lang_nosp_test_tglarge utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_fglarge || exit 1; + data/lang_nosp data/lang_nosp_test_fglarge mfccdir=mfcc # spread the mfccs over various machines, as this data-set is quite large. -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ + utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ $mfccdir/storage fi @@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k # train a monophone system steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ - data/train_2kshort data/lang_nosp exp/mono || exit 1; + data/train_2kshort data/lang_nosp exp/mono # decode using the monophone model ( utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \ - exp/mono exp/mono/graph_nosp_tgsmall || exit 1 + exp/mono exp/mono/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ - data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1 + data/$test exp/mono/decode_nosp_tgsmall_$test done )& @@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ # train a first delta + delta-delta triphone system on a subset of 5000 utterances steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ - 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1; + 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 # decode using the tri1 model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1; + exp/tri1 exp/tri1/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \ - data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri1/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test done )& steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - 
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k # train an LDA+MLLT system. steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1; + data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b # decode using the LDA+MLLT model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1; + exp/tri2b exp/tri2b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \ - data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri2b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test done )& # Align a 10k utts subset using the tri2b model steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ - data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k # Train tri3b, which is LDA+MLLT+SAT on 10k utts steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1; + data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b # decode using the tri3b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1; + exp/tri3b exp/tri3b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri3b/graph_nosp_tgsmall data/$test \ - exp/tri3b/decode_nosp_tgsmall_$test || exit 1; + exp/tri3b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test done )& # align the entire train_clean_100 subset using the tri3b model steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ data/train_clean_100 data/lang_nosp \ - exp/tri3b exp/tri3b_ali_clean_100 || exit 1; + exp/tri3b exp/tri3b_ali_clean_100 # train another LDA+MLLT+SAT system on the entire 100 hour subset steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ data/train_clean_100 data/lang_nosp \ - exp/tri3b_ali_clean_100 exp/tri4b || exit 1; + exp/tri3b_ali_clean_100 exp/tri4b # decode using the tri4b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1; + exp/tri4b exp/tri4b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_nosp_tgsmall data/$test \ - exp/tri4b/decode_nosp_tgsmall_$test || exit 1; + exp/tri4b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test 
exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test done )& @@ -205,125 +205,125 @@ steps/get_prons.sh --cmd "$train_cmd" \ utils/dict_dir_add_pronprobs.sh --max-normalize true \ data/local/dict_nosp \ exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1 + exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict utils/prepare_lang.sh data/local/dict \ "" data/local/lang_tmp data/lang local/format_lms.sh --src-dir data/lang data/local/lm utils/build_const_arpa_lm.sh \ - data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1; + data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge utils/build_const_arpa_lm.sh \ - data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1; + data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge # decode using the tri4b model with pronunciation and silence probabilities ( utils/mkgraph.sh \ - data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1; + data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_tgsmall data/$test \ - exp/tri4b/decode_tgsmall_$test || exit 1; + exp/tri4b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test done )& # align train_clean_100 using the tri4b model steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1; + data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 # if you want at this point you can train and test NN model(s) on the 100 hour # subset -local/nnet2/run_5a_clean_100.sh || exit 1 +local/nnet2/run_5a_clean_100.sh -local/download_and_untar.sh $data $data_url train-clean-360 || exit 1; +local/download_and_untar.sh $data $data_url train-clean-360 # now add the "clean-360" subset to the mix ... local/data_prep.sh \ - $data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1 + $data/LibriSpeech/train-clean-360 data/train_clean_360 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \ - exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + exp/make_mfcc/train_clean_360 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir # ... 
and then combine the two sets into a 460 hour one utils/combine_data.sh \ - data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1 + data/train_clean_460 data/train_clean_100 data/train_clean_360 # align the new, combined set, using the tri4b model steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1; + data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 # create a larger SAT model, trained on the 460 hours of data. steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \ - data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1; + data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b # decode using the tri5b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri5b exp/tri5b/graph_tgsmall || exit 1; + exp/tri5b exp/tri5b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri5b/graph_tgsmall data/$test \ - exp/tri5b/decode_tgsmall_$test || exit 1; + exp/tri5b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test done )& # train a NN model on the 460 hour set -local/nnet2/run_6a_clean_460.sh || exit 1 +local/nnet2/run_6a_clean_460.sh -local/download_and_untar.sh $data $data_url train-other-500 || exit 1; +local/download_and_untar.sh $data $data_url train-other-500 # prepare the 500 hour subset. local/data_prep.sh \ - $data/LibriSpeech/train-other-500 data/train_other_500 || exit 1 + $data/LibriSpeech/train-other-500 data/train_other_500 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \ - exp/make_mfcc/train_other_500 $mfccdir || exit 1 + exp/make_mfcc/train_other_500 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1 + data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir # combine all the data utils/combine_data.sh \ - data/train_960 data/train_clean_460 data/train_other_500 || exit 1 + data/train_960 data/train_clean_460 data/train_other_500 steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1; + data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script # as it is faster. 
steps/train_quick.sh --cmd "$train_cmd" \ - 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1; + 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b # decode using the tri6b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri6b exp/tri6b/graph_tgsmall || exit 1; + exp/tri6b exp/tri6b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1; + exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test done )& @@ -349,7 +349,7 @@ steps/train_quick.sh --cmd "$train_cmd" \ # train NN models on the entire dataset -local/nnet2/run_7a_960.sh || exit 1 +local/nnet2/run_7a_960.sh # # train models on cleaned-up data # # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh diff --git a/egs/lre/v1/cmd.sh b/egs/lre/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre/v1/cmd.sh +++ b/egs/lre/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre07/v1/cmd.sh b/egs/lre07/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre07/v1/cmd.sh +++ b/egs/lre07/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
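The cmd.sh comment blocks added throughout this patch refer to conf/queue.conf without showing one. The sketch below is only an illustration of what such a file can look like on a GridEngine cluster; it is modeled on the default_config string documented in utils/queue.pl, and the qsub flags, resource names and the g.q GPU queue are assumptions that must be adapted to your own grid.

# conf/queue.conf -- illustrative sketch only; adapt to your cluster
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0
option num_threads=* -pe smp $0
option num_threads=1
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q

With a file like this in place, an invocation such as "queue.pl --mem 4G" maps the mem option onto the matching "-l mem_free=4G,ram_free=4G" qsub arguments, which is why the new cmd.sh files can use queue-independent options like --mem and --gpu.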
+ +export train_cmd="queue.pl --mem 4G" diff --git a/egs/reverb/s5/RESULTS b/egs/reverb/s5/RESULTS index 031a6b2ec1a..3537852a827 100644 --- a/egs/reverb/s5/RESULTS +++ b/egs/reverb/s5/RESULTS @@ -1,306 +1,150 @@ -local/summarize_results.pl tri2a -#### RESULTS FOR dt ##### - -exp/tri2a/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 89.00 -RealData_dt_for_1ch_near_room1_A 90.39 -SimData_dt_for_1ch_far_room1_A 22.35 -SimData_dt_for_1ch_far_room2_A 88.37 -SimData_dt_for_1ch_far_room3_A 90.85 -SimData_dt_for_1ch_near_room1_A 12.29 -SimData_dt_for_1ch_near_room2_A 42.86 -SimData_dt_for_1ch_near_room3_A 50.17 -Avg_Sim(6) 51.15 -Avg_Real(2) 89.69 - - -#### RESULTS FOR et ##### - -exp/tri2a/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 51.88 -RealData_dt_for_1ch_near_room1_A 56.14 -SimData_dt_for_1ch_far_room1_A 17.45 -SimData_dt_for_1ch_far_room2_A 44.02 -SimData_dt_for_1ch_far_room3_A 49.90 -SimData_dt_for_1ch_near_room1_A 15.29 -SimData_dt_for_1ch_near_room2_A 22.11 -SimData_dt_for_1ch_near_room3_A 26.34 -Avg_Sim(6) 29.18 -Avg_Real(2) 54.01 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.95 -RealData_dt_for_1ch_near_room1_A 48.91 -SimData_dt_for_1ch_far_room1_A 16.37 -SimData_dt_for_1ch_far_room2_A 35.67 -SimData_dt_for_1ch_far_room3_A 39.59 -SimData_dt_for_1ch_near_room1_A 13.03 -SimData_dt_for_1ch_near_room2_A 17.08 -SimData_dt_for_1ch_near_room3_A 20.00 +#################### +exp/tri2a/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 89.13 +RealData_dt_for_1ch_near_room1_A 90.27 +SimData_dt_for_1ch_far_room1_A 22.44 +SimData_dt_for_1ch_far_room2_A 88.44 +SimData_dt_for_1ch_far_room3_A 91.27 +SimData_dt_for_1ch_near_room1_A 12.19 +SimData_dt_for_1ch_near_room2_A 42.74 +SimData_dt_for_1ch_near_room3_A 49.31 +Avg_Real(2) 89.70 +Avg_Sim(6) 51.06 + +exp/tri2a/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 88.45 +RealData_et_for_1ch_near_room1_A 88.66 +SimData_et_for_1ch_far_room1_A 22.72 +SimData_et_for_1ch_far_room2_A 81.53 +SimData_et_for_1ch_far_room3_A 89.25 +SimData_et_for_1ch_near_room1_A 14.37 +SimData_et_for_1ch_near_room2_A 40.46 +SimData_et_for_1ch_near_room3_A 51.50 +Avg_Real(2) 88.56 +Avg_Sim(6) 49.97 + +#################### +exp/tri2a_mc/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 53.38 +RealData_dt_for_1ch_near_room1_A 56.27 +SimData_dt_for_1ch_far_room1_A 16.96 +SimData_dt_for_1ch_far_room2_A 44.15 +SimData_dt_for_1ch_far_room3_A 49.88 +SimData_dt_for_1ch_near_room1_A 15.00 +SimData_dt_for_1ch_near_room2_A 21.81 +SimData_dt_for_1ch_near_room3_A 25.10 +Avg_Real(2) 54.83 +Avg_Sim(6) 28.82 + +exp/tri2a_mc/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 52.94 +RealData_et_for_1ch_near_room1_A 55.35 +SimData_et_for_1ch_far_room1_A 18.91 +SimData_et_for_1ch_far_room2_A 37.33 +SimData_et_for_1ch_far_room3_A 46.69 +SimData_et_for_1ch_near_room1_A 17.77 +SimData_et_for_1ch_near_room2_A 21.23 +SimData_et_for_1ch_near_room3_A 26.17 +Avg_Real(2) 54.14 +Avg_Sim(6) 28.02 + +#################### +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 46.27 +RealData_dt_for_1ch_near_room1_A 
48.85 +SimData_dt_for_1ch_far_room1_A 15.59 +SimData_dt_for_1ch_far_room2_A 35.86 +SimData_dt_for_1ch_far_room3_A 39.54 +SimData_dt_for_1ch_near_room1_A 12.78 +SimData_dt_for_1ch_near_room2_A 17.75 +SimData_dt_for_1ch_near_room3_A 20.23 +Avg_Real(2) 47.56 Avg_Sim(6) 23.62 -Avg_Real(2) 46.43 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2b -#### RESULTS FOR dt ##### - -exp/tri2b/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 91.66 -RealData_dt_for_1ch_near_room1_A 91.33 -SimData_dt_for_1ch_far_room1_A 26.94 -SimData_dt_for_1ch_far_room2_A 85.63 -SimData_dt_for_1ch_far_room3_A 91.99 -SimData_dt_for_1ch_near_room1_A 11.95 -SimData_dt_for_1ch_near_room2_A 34.51 -SimData_dt_for_1ch_near_room3_A 44.81 -Avg_Sim(6) 49.30 -Avg_Real(2) 91.50 - - -#### RESULTS FOR et ##### - -exp/tri2b/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 91.29 -RealData_et_for_1ch_near_room1_A 92.05 -SimData_et_for_1ch_far_room1_A 24.16 -SimData_et_for_1ch_far_room2_A 78.57 -SimData_et_for_1ch_far_room3_A 91.01 -SimData_et_for_1ch_near_room1_A 13.76 -SimData_et_for_1ch_near_room2_A 32.94 -SimData_et_for_1ch_near_room3_A 48.24 -Avg_Sim(6) 48.11 -Avg_Real(2) 91.67 - - -local/summarize_results.pl tri2b_mc -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 45.18 -RealData_dt_for_1ch_near_room1_A 49.91 -SimData_dt_for_1ch_far_room1_A 15.78 -SimData_dt_for_1ch_far_room2_A 34.75 -SimData_dt_for_1ch_far_room3_A 37.56 -SimData_dt_for_1ch_near_room1_A 13.45 -SimData_dt_for_1ch_near_room2_A 17.57 -SimData_dt_for_1ch_near_room3_A 19.49 -Avg_Sim(6) 23.10 -Avg_Real(2) 47.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 47.67 -RealData_et_for_1ch_near_room1_A 50.65 -SimData_et_for_1ch_far_room1_A 16.69 -SimData_et_for_1ch_far_room2_A 30.36 -SimData_et_for_1ch_far_room3_A 38.08 -SimData_et_for_1ch_near_room1_A 15.67 -SimData_et_for_1ch_near_room2_A 17.71 -SimData_et_for_1ch_near_room3_A 20.10 -Avg_Sim(6) 23.10 -Avg_Real(2) 49.16 - - -local/summarize_results.pl tri2b_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 39.37 -RealData_dt_for_1ch_near_room1_A 42.48 -SimData_dt_for_1ch_far_room1_A 14.11 -SimData_dt_for_1ch_far_room2_A 28.81 -SimData_dt_for_1ch_far_room3_A 31.53 -SimData_dt_for_1ch_near_room1_A 11.18 -SimData_dt_for_1ch_near_room2_A 15.01 -SimData_dt_for_1ch_near_room3_A 15.48 -Avg_Sim(6) 19.35 -Avg_Real(2) 40.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 42.03 -RealData_et_for_1ch_near_room1_A 43.53 -SimData_et_for_1ch_far_room1_A 13.87 -SimData_et_for_1ch_far_room2_A 26.02 -SimData_et_for_1ch_far_room3_A 32.80 -SimData_et_for_1ch_near_room1_A 12.42 -SimData_et_for_1ch_near_room2_A 14.82 -SimData_et_for_1ch_near_room3_A 17.02 -Avg_Sim(6) 19.49 -Avg_Real(2) 42.78 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.06 -RealData_dt_for_1ch_near_room1_A 46.04 -SimData_dt_for_1ch_far_room1_A 13.59 -SimData_dt_for_1ch_far_room2_A 29.55 -SimData_dt_for_1ch_far_room3_A 32.52 -SimData_dt_for_1ch_near_room1_A 11.21 -SimData_dt_for_1ch_near_room2_A 15.23 
-SimData_dt_for_1ch_near_room3_A 16.42 -Avg_Sim(6) 19.75 -Avg_Real(2) 44.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 43.45 -RealData_et_for_1ch_near_room1_A 46.89 -SimData_et_for_1ch_far_room1_A 13.37 -SimData_et_for_1ch_far_room2_A 25.96 -SimData_et_for_1ch_far_room3_A 31.73 -SimData_et_for_1ch_near_room1_A 11.89 -SimData_et_for_1ch_near_room2_A 14.64 -SimData_et_for_1ch_near_room3_A 17.26 -Avg_Sim(6) 19.14 -Avg_Real(2) 45.17 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 36.98 -RealData_dt_for_1ch_near_room1_A 39.68 -SimData_dt_for_1ch_far_room1_A 11.43 -SimData_dt_for_1ch_far_room2_A 25.24 -SimData_dt_for_1ch_far_room3_A 27.77 -SimData_dt_for_1ch_near_room1_A 9.19 -SimData_dt_for_1ch_near_room2_A 12.77 -SimData_dt_for_1ch_near_room3_A 13.30 -Avg_Sim(6) 16.62 -Avg_Real(2) 38.33 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 38.93 -RealData_et_for_1ch_near_room1_A 39.51 -SimData_et_for_1ch_far_room1_A 11.32 -SimData_et_for_1ch_far_room2_A 22.31 -SimData_et_for_1ch_far_room3_A 28.40 -SimData_et_for_1ch_near_room1_A 9.69 -SimData_et_for_1ch_near_room2_A 12.36 -SimData_et_for_1ch_near_room3_A 14.77 -Avg_Sim(6) 16.47 -Avg_Real(2) 39.22 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 31.58 -RealData_dt_for_1ch_near_room1_A 32.00 -SimData_dt_for_1ch_far_room1_A 8.51 -SimData_dt_for_1ch_far_room2_A 18.36 -SimData_dt_for_1ch_far_room3_A 20.40 -SimData_dt_for_1ch_near_room1_A 6.47 -SimData_dt_for_1ch_near_room2_A 9.61 -SimData_dt_for_1ch_near_room3_A 9.59 -Avg_Sim(6) 12.16 -Avg_Real(2) 31.79 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 30.32 -RealData_et_for_1ch_near_room1_A 32.45 -SimData_et_for_1ch_far_room1_A 7.74 -SimData_et_for_1ch_far_room2_A 17.01 -SimData_et_for_1ch_far_room3_A 21.05 -SimData_et_for_1ch_near_room1_A 7.01 -SimData_et_for_1ch_near_room2_A 9.52 -SimData_et_for_1ch_near_room3_A 11.29 -Avg_Sim(6) 12.27 -Avg_Real(2) 31.39 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 mbr_basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 30.96 -RealData_dt_for_1ch_near_room1_A 30.88 -SimData_dt_for_1ch_far_room1_A 8.33 -SimData_dt_for_1ch_far_room2_A 18.14 -SimData_dt_for_1ch_far_room3_A 20.15 -SimData_dt_for_1ch_near_room1_A 6.24 -SimData_dt_for_1ch_near_room2_A 9.47 -SimData_dt_for_1ch_near_room3_A 9.62 -Avg_Sim(6) 11.99 -Avg_Real(2) 30.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 29.37 -RealData_et_for_1ch_near_room1_A 31.84 -SimData_et_for_1ch_far_room1_A 7.64 -SimData_et_for_1ch_far_room2_A 16.86 -SimData_et_for_1ch_far_room3_A 20.59 -SimData_et_for_1ch_near_room1_A 6.93 -SimData_et_for_1ch_near_room2_A 9.48 -SimData_et_for_1ch_near_room3_A 11.19 -Avg_Sim(6) 12.11 -Avg_Real(2) 30.61 +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 48.11 +RealData_et_for_1ch_near_room1_A 48.42 +SimData_et_for_1ch_far_room1_A 16.57 
+SimData_et_for_1ch_far_room2_A 31.54 +SimData_et_for_1ch_far_room3_A 39.32 +SimData_et_for_1ch_near_room1_A 14.31 +SimData_et_for_1ch_near_room2_A 18.42 +SimData_et_for_1ch_near_room3_A 21.03 +Avg_Real(2) 48.27 +Avg_Sim(6) 23.53 + +#################### +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 34.04 +RealData_dt_for_1ch_near_room1_A 33.37 +SimData_dt_for_1ch_far_room1_A 10.57 +SimData_dt_for_1ch_far_room2_A 22.63 +SimData_dt_for_1ch_far_room3_A 25.00 +SimData_dt_for_1ch_near_room1_A 7.57 +SimData_dt_for_1ch_near_room2_A 10.97 +SimData_dt_for_1ch_near_room3_A 12.59 +Avg_Real(2) 33.70 +Avg_Sim(6) 14.89 + +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 33.49 +RealData_et_for_1ch_near_room1_A 34.72 +SimData_et_for_1ch_far_room1_A 10.03 +SimData_et_for_1ch_far_room2_A 20.16 +SimData_et_for_1ch_far_room3_A 25.08 +SimData_et_for_1ch_near_room1_A 8.45 +SimData_et_for_1ch_near_room2_A 11.16 +SimData_et_for_1ch_near_room3_A 12.88 +Avg_Real(2) 34.11 +Avg_Sim(6) 14.63 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 31.17 +RealData_dt_for_1ch_near_room1_A 31.82 +SimData_dt_for_1ch_far_room1_A 8.53 +SimData_dt_for_1ch_far_room2_A 17.43 +SimData_dt_for_1ch_far_room3_A 21.04 +SimData_dt_for_1ch_near_room1_A 6.78 +SimData_dt_for_1ch_near_room2_A 8.97 +SimData_dt_for_1ch_near_room3_A 10.01 +Avg_Real(2) 31.50 +Avg_Sim(6) 12.13 + +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 31.20 +RealData_et_for_1ch_near_room1_A 30.98 +SimData_et_for_1ch_far_room1_A 8.42 +SimData_et_for_1ch_far_room2_A 17.63 +SimData_et_for_1ch_far_room3_A 20.71 +SimData_et_for_1ch_near_room1_A 7.03 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 11.11 +Avg_Real(2) 31.09 +Avg_Sim(6) 12.40 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 30.42 +RealData_dt_for_1ch_near_room1_A 31.50 +SimData_dt_for_1ch_far_room1_A 8.24 +SimData_dt_for_1ch_far_room2_A 17.25 +SimData_dt_for_1ch_far_room3_A 20.72 +SimData_dt_for_1ch_near_room1_A 6.76 +SimData_dt_for_1ch_near_room2_A 8.87 +SimData_dt_for_1ch_near_room3_A 9.92 +Avg_Real(2) 30.96 +Avg_Sim(6) 11.96 + +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 30.89 +RealData_et_for_1ch_near_room1_A 31.01 +SimData_et_for_1ch_far_room1_A 8.20 +SimData_et_for_1ch_far_room2_A 17.34 +SimData_et_for_1ch_far_room3_A 20.56 +SimData_et_for_1ch_near_room1_A 6.91 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 10.93 +Avg_Real(2) 30.95 +Avg_Sim(6) 12.24 diff --git a/egs/reverb/s5/cmd.sh b/egs/reverb/s5/cmd.sh index e88b07e1195..71dd849a93b 100644 --- a/egs/reverb/s5/cmd.sh +++ b/egs/reverb/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64,gpu=1 -q g.q" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/reverb/s5/corpus.sh b/egs/reverb/s5/corpus.sh deleted file mode 100644 index 32a2ee4b85b..00000000000 --- a/egs/reverb/s5/corpus.sh +++ /dev/null @@ -1,17 +0,0 @@ -if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then - REVERB_home=/export/corpora5/REVERB_2014/REVERB - export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 - # set LDC WSJ0 directory to obtain LMs - # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) - export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B - # It is assumed that there will be a 'wsj0' subdirectory - # within the top-level corpus directory -else - echo "Set the data directory locations." && exit 1; -fi - -export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt -export reverb_et=$REVERB_home/REVERB_WSJCAM0_et -export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev -export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval - diff --git a/egs/reverb/s5/local/Generate_mcTrainData_cut.m b/egs/reverb/s5/local/Generate_mcTrainData_cut.m old mode 100644 new mode 100755 diff --git a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh index c3de2ba7fd3..a4599f97702 100755 --- a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh @@ -65,8 +65,8 @@ if [ ! -z "$3" ]; then dt_or_x=$3 fi -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch @@ -97,11 +97,11 @@ s/\x0D$//' \ # e.g. 
yield' --> yield # reason: YIELD' is not in dict, while YIELD is s/YIELD'/YIELD/g - s/'ROOTS'/ROOTS/g - s/'WHERE/WHERE/g + s/'ROOTS'/ROOTS/g + s/'WHERE/WHERE/g s/PEOPLE'/PEOPLE/g s/SIT'/SIT/g - s/'DOMINEE/DOMINEE/g + s/'DOMINEE/DOMINEE/g s/CHURCH'/CHURCH/g" \ -e ' # fix the single missing double full stop issue at the end of an utterance @@ -110,9 +110,9 @@ s/\x0D$//' \ /^[A-Z]$/ { # append a line N - # search for single dot on the second line + # search for single dot on the second line /\n\./ { - # found it - now replace the + # found it - now replace the s/\([A-Z]\)\n\./\1\.\n\./ } }' \ @@ -156,9 +156,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh index 2c169e84b59..6ab2f2f4b73 100755 --- a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh @@ -50,8 +50,8 @@ fi cd $dir MIC=primary -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch #taskFiles=`ls $taskFileDir/*Data_dt_for_*` @@ -108,9 +108,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/calc_wer.sh b/egs/reverb/s5/local/calc_wer.sh new file mode 100755 index 00000000000..c4b5eeb87f3 --- /dev/null +++ b/egs/reverb/s5/local/calc_wer.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 MERL (author: Shinji Watanabe) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +. ./cmd.sh +. ./path.sh + +lmw=15 +am="tri2a" +lm="bg_5k" +decode="" + +. utils/parse_options.sh + +if [ ! 
-z $decode ]; then + decode="_$decode" +fi + +dir="exp/$am/decode${decode}_${lm}_REVERB_" +echo "####################" +echo "${dir}*dt*" +for a in `echo ${dir}*dt* | tr " " "\n" | grep -v "A\.si"`; do + echo $a | awk -F '_' '{for(i=NF-6;i -1) { - if ($ARGV[0] =~ /^--lmw=(\d+)$/) - { - $opt_lmw = $1 + 0; - shift @ARGV; - } - elsif ($ARGV[0] =~ /^--lm=(\w+)$/) { - $lm = $1; - shift @ARGV; - } - else { - last; - } -} - - -print "$0 @ARGV\n"; - -my $system = "tri2b_mc"; -if ($ARGV[0] ne "") { $system = $ARGV[0]; } - -for my $dt_or_et ("dt", "et") { - -print "#### RESULTS FOR $dt_or_et ##### \n\n"; - -my $pref = "REVERB_$dt_or_et"; -#if ($lm ne "bg_5k") { -$pref = "${lm}_$pref"; -#} -if ($ARGV[1] ne "") { $pref = $ARGV[1] . '_' . $pref; } -if ($ARGV[2] ne "") { $pref = $pref . '_' . $ARGV[2]; } - -my $suff = ""; - -print "exp/$system/decode_$suff$pref*\n"; -my @folders = glob("exp/$system/decode_$suff$pref*"); - -my ($min_lmw, $max_lmw) = (9, 20); -@folders = grep { -f "$_/wer_$min_lmw" } @folders; -my @sum_wer; -my %wer; -my %avg_wer_disp; -my $nc = 0; -my $ns = 0; -my $nr = 0; -for my $lmw ($min_lmw..$max_lmw) -{ - for my $fold (@folders) { - my $res_file = "$fold/wer_$lmw"; - #print "fold = $fold pref = $pref\n"; - #my ($cond) = $fold =~ /decode_(\w+)$/; - my ($cond) = $fold =~ /decode_\Q$suff\E\Q${pref}\E_(\w+)$/; - if ($cond =~ /^Sim.+(far|near|cln)|^Real/) { - open(RES, $res_file) or die "$res_file: $_"; - while () { - if (/%WER\s+(\S+)/) { - my $wer = $1; - #print "cond = $cond lmw = $lmw wer = $1\n"; - if ($cond !~ /cln/) { - $sum_wer[$lmw] += $wer; - } - $wer{$cond}[$lmw] = $wer; - } - } - #print "cond = $cond fold = $fold\n"; - } - } -} - -if (!$opt_lmw && $dt_or_et eq "dt") { - $opt_lmw = $min_lmw; - for my $lmw ($min_lmw+1..$max_lmw) { - if ($sum_wer[$lmw] < $sum_wer[$opt_lmw]) { - $opt_lmw = $lmw; - } - } -} - -print "LMW = $opt_lmw\n"; -for my $cond (sort keys %wer) { - print "$cond\t$wer{$cond}[$opt_lmw]\n"; - if ($cond =~ /SimData_[de]t/) { - if ($cond !~ /cln/) { - $avg_wer_disp{"SimData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"SimData"}) / ++$ns; - } - else { - $avg_wer_disp{"CleanData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"CleanData"}) / ++$nc; - } - } - elsif ($cond =~ /RealData_[de]t/) { - $avg_wer_disp{"RealData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"RealData"}) / ++$nr; - } -} - -#print "Avg_Clean($nc)\t", sprintf("%.2f", $avg_wer_disp{"CleanData"}), "\n"; -print "Avg_Sim($ns)\t", sprintf("%.2f", $avg_wer_disp{"SimData"}), "\n"; -print "Avg_Real($nr)\t", sprintf("%.2f", $avg_wer_disp{"RealData"}), "\n"; -print "\n\n"; - -} diff --git a/egs/reverb/s5/run.sh b/egs/reverb/s5/run.sh index 0e3eac6e6c1..ffb0b20422d 100755 --- a/egs/reverb/s5/run.sh +++ b/egs/reverb/s5/run.sh @@ -15,89 +15,92 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. +# Caution: some of the graph creation steps use quite a bit of memory, so you +# should run this on a machine that has sufficient memory. + # Requirements) matlab and tcsh if [ ! `which tcsh` ]; then - echo "Install tcsh, which is used in some REVERB scripts" - exit 1 + echo "Install tcsh, which is used in some REVERB scripts" + exit 1 fi if [ ! 
`which matlab` ]; then - echo "Install matlab, which is used to generate multi-condition data" - exit 1 + echo "Install matlab, which is used to generate multi-condition data" + exit 1 fi -if [ ! -e path.sh ] || [ ! -e corpus.sh ]; then - echo "ERROR: path.sh and/or corpus.sh not found" - echo "You need to create these from {path,corpus}.sh.default to match your system" - echo "Make sure you follow the instructions in ../README.txt" - exit 1 +. ./cmd.sh +. ./path.sh + +stage=1 +. utils/parse_options.sh +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail + +# please make sure to set the paths of the REVERB and WSJ0 data +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + REVERB_home=/export/corpora5/REVERB_2014/REVERB + export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +elif [[ $(hostname -f) == *.merl.com ]] ; then + REVERB_home=/db/laputa1/data/original/public/REVERB + export wsjcam0=$REVERB_home/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/db/laputa1/data/original/public/WSJ0/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +else + echo "Set the data directory locations." && exit 1; fi +export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt +export reverb_et=$REVERB_home/REVERB_WSJCAM0_et +export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev +export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval -. ./cmd.sh - -# please make sure to set the paths of the REVERB and WSJ0 data -. ./corpus.sh - -# set the directory of the multi-condition training data generated +# set the directory of the multi-condition training data to be generated reverb_tr=`pwd`/data_tr_cut/REVERB_WSJCAM0_tr_cut # LDA context size (left/right) (4 is default) context_size=4 -# The language models with which to decode (tg_5k or bg_5k or "tg_5k bg_5k" for -# both) -lms="bg_5k tg_5k" +# The language models with which to decode (tg_5k or bg_5k) +lm="tg_5k" # number of jobs for feature extraction and model training nj_train=30 # number of jobs for decoding -# use less jobs for trigram model -# if you have enough RAM (~ 32 GB), you can use 8 jobs for trigram as well -nj_bg=8 -nj_tg=8 -nj_bg=25 ## -nj_tg=25 ## - -# set to true if running from scratch -do_prep=true +nj_decode=8 # set to true if you want the tri2a systems (re-implementation of the HTK baselines) do_tri2a=true - -# The following are the settings determined by Gaussian Process optimization. -# However, they are not used in the final system. -# You can use the code below for training the "tri2c_mc" system. - -# LDA parameters for MCT recognizer. -# Use significantly more context than the default (7 frames ~ 85 ms) -mct_lda_left_context=7 -mct_lda_right_context=5 - -# Number of states and Gaussians for the MCT recognizer. -mct_nstates=7500 -mct_ngauss=45000 - -## End of GP tuned settings - -false && { -if $do_prep; then +if [ $stage -le 1 ]; then # Generate multi-condition training data # Note that utterance lengths match the original set. 
# This enables using clean alignments in multi-condition training (stereo training) - #local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr + local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr +fi +if [ $stage -le 2 ]; then # Prepare wsjcam0 clean data and wsj0 language model. - local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 || exit 1 + local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 # Prepare merged BEEP/CMU dictionary. - local/wsj_prepare_beep_dict.sh || exit 1; + local/wsj_prepare_beep_dict.sh # Prepare wordlists, etc. - utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang # Prepare directory structure for clean data. Apply some language model fixes. - local/wsjcam0_format_data.sh || exit 1; + local/wsjcam0_format_data.sh # Now it's getting more interesting. # Prepare the multi-condition training data and the REVERB dt set. @@ -108,253 +111,227 @@ if $do_prep; then # local/REVERB_wsjcam0_data_prep.sh /path/to/processed/REVERB_WSJCAM0_dt processed_REVERB_dt dt # The first argument is supposed to point to a folder that has the same structure # as the REVERB corpus. - local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et || exit 1; + local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr + local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt + local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et # Prepare the REVERB "real" dt set from MCWSJAV corpus. # This corpus is *never* used for training. # This creates the data set called REVERB_Real_dt and its subfolders - local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt # The MLF file exists only once in the corpus, namely in the real_dt directory # so we pass it as 4th argument - local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf +fi +if [ $stage -le 3 ]; then # Extract MFCC features for clean sets. # For the non-clean data sets, this is outsourced to the data preparation scripts. mfccdir=mfcc ### for x in si_tr si_dt; do it seems that the number of transcriptions of si_dt is not correct. - for x in si_tr; do - steps/make_mfcc.sh --nj $nj_train \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + for x in si_tr; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj_train \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done fi -# Train monophone model on clean data (si_tr). -if [ ! -e exp/mono0a/final.mdl ]; then - echo "### TRAINING mono0a ###" - steps/train_mono.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a || exit 1; +if [ $stage -le 4 ]; then + # Train monophone model on clean data (si_tr). + echo "### TRAINING mono0a ###" + steps/train_mono.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a + + # Align monophones with clean data. + echo "### ALIGNING mono0a_ali ###" + steps/align_si.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a exp/mono0a_ali + + # Create first triphone recognizer. 
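+  # (For steps/train_deltas.sh the two numeric arguments are the target number of
+  # tree leaves and the total number of Gaussians, here 2000 and 10000.)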
+ echo "### TRAINING tri1 ###" + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 + + echo "### ALIGNING tri1_ali ###" + # Re-align triphones. + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri1 exp/tri1_ali fi -# Align monophones with clean data. -if [ ! -e exp/mono0a_ali/ali.1.gz ]; then - echo "### ALIGNING mono0a_ali ###" - steps/align_si.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a exp/mono0a_ali || exit 1; -fi - -# Create first triphone recognizer. -if [ ! -e exp/tri1/final.mdl ]; then - echo "### TRAINING tri1 ###" - steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 || exit 1; -fi - -# Prepare first triphone recognizer and decode clean si_dt for verification. -#utils/mkgraph.sh data/lang_test_bg_5k exp/tri1 exp/tri1/graph_bg_5k || exit 1; -#steps/decode.sh --nj 8 exp/tri1/graph_bg_5k data/si_dt exp/tri1/decode_si_dt - -if [ ! -e exp/tri1_ali/ali.1.gz ]; then - echo "### ALIGNING tri1_ali ###" - # Re-align triphones. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri1 exp/tri1_ali || exit 1; -fi - - # The following code trains and evaluates a delta feature recognizer, which is similar to the HTK # baseline (but using per-utterance basis fMLLR instead of batch MLLR). This is for reference only. if $do_tri2a; then +if [ $stage -le 5 ]; then # Train tri2a, which is deltas + delta-deltas, on clean data. - steps/train_deltas.sh \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a # Re-align triphones using clean data. This gives a smallish performance gain. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri2a exp/tri2a_ali || exit 1; + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri2a exp/tri2a_ali # Train a multi-condition triphone recognizer. # This uses alignments on *clean* data, which is allowed for REVERB. - # However, we have to use the "cut" version so that the length of the + # However, we have to use the "cut" version so that the length of the # waveforms match. # It is actually asserted by the Challenge that clean and multi-condition waves are aligned. - steps/train_deltas.sh \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc # Prepare clean and mc tri2a models for decoding. 
- utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k - utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k & + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k & + wait +fi +if [ $stage -le 6 ]; then # decode REVERB dt using tri2a, clean - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done # decode REVERB dt using tri2a, mc - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done + # basis fMLLR for tri2a_mc system # This computes a transform for every training utterance and computes a basis from that. - steps/get_fmllr_basis.sh --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc || exit 1; + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc # Recognition using fMLLR adaptation (per-utterance processing). - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode_basis_fmllr.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done - -fi # train tri2a, tri2a_mc - - -# Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. -if [ ! -e exp/tri2b/final.mdl ]; then - echo "### TRAINING tri2b ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b || exit 1; + wait fi - -# tri2b (LDA-MLLT system) with multi-condition training, using default parameters. -if [ ! -e exp/tri2b_mc/final.mdl ]; then - echo "### TRAINING tri2b_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc || exit 1; fi - -# tri2c (LDA-MLLT system) with multi-condition training, optimized parameters. -# Disabled by default -- it only improves slightly, and tends to overfit. -if [ ! -e exp/tri2c_mc/final.mdl ]; then - echo "### TRAINING tri2c_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$mct_lda_left_context --right-context=$mct_lda_right_context" \ - $mct_nstates $mct_ngauss data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2c_mc || exit 1; +if [ $stage -le 7 ]; then + # Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. 
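+  # (train_lda_mllt.sh splices the features across +/-$context_size frames, reduces
+  # them with LDA and then estimates an MLLT/STC transform on top of that.)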
+ echo "### TRAINING tri2b ###" + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b + + # tri2b (LDA-MLLT system) with multi-condition training, using default parameters. + echo "### TRAINING tri2b_mc ###" + steps/train_lda_mllt.sh --cmd "$train_cmd"\ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc fi - # Prepare tri2b* systems for decoding. -for recog in tri2b tri2b_mc; do - for lm in $lms; do - graph=exp/$recog/graph_$lm - if [ ! -e "$graph" ]; then - echo "### MAKING GRAPH $graph ###" - utils/mkgraph.sh data/lang_test_$lm exp/$recog $graph || exit 1; - fi - done -done - +if [ $stage -le 8 ]; then + echo "### MAKING GRAPH {tri2b,tri2b_mc}/graph_$lm ###" + for recog in tri2b tri2b_mc; do + utils/mkgraph.sh data/lang_test_$lm exp/$recog exp/$recog/graph_$lm & + done + wait +fi # discriminative training on top of multi-condition systems # one could also add tri2b here to have a DT clean recognizer for reference -for base_recog in tri2b_mc; do - - bmmi_recog=${base_recog}_mmi_b0.1 - echo "### DT $base_recog --> $bmmi_recog ###" +if [ $stage -le 9 ]; then + base_recog=tri2b_mc + bmmi_recog=${base_recog}_mmi_b0.1 + echo "### DT $base_recog --> $bmmi_recog ###" + + # get alignments from base recognizer + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali + + # get lattices from base recognizer + denlats_dir=${base_recog}_denlats + subsplit=`echo $nj_train \* 2 | bc` + # DT with multi-condition data ... + steps/make_denlats.sh --sub-split $subsplit --nj $nj_train --cmd "$decode_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir + + # boosted MMI training + steps/train_mmi.sh --boost 0.1 --cmd "$train_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A \ + data/lang \ + exp/${base_recog}_ali \ + exp/$denlats_dir \ + exp/$bmmi_recog + cp exp/$base_recog/ali.* exp/$bmmi_recog +fi - # get alignments from base recognizer - if [ ! -e exp/${base_recog}_ali/ali.1.gz ]; then - steps/align_si.sh --nj $nj_train \ - --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali || exit 1; - fi +# decoding using various recognizers +if [ $stage -le 10 ]; then + # put tri2b last since it takes longest due to the large mismatch. 
+ for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + echo "### DECODING with $recog, noadapt, $lm ###" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_$decode_suff & + done + wait + + echo " ## MBR RESCORING with $recog, noadapt ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_$decode_suff + cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff & + done + wait - # get lattices from base recognizer - denlats_dir=${base_recog}_denlats - subsplit=`echo $nj_train \* 2 | bc` - if [ ! -e exp/$denlats_dir/.done.1 ]; then - # DT with multi-condition data ... - steps/make_denlats.sh --sub-split $subsplit --nj $nj_train \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir || exit 1; - fi + done # loop recog +fi - # boosted MMI training - if [ ! -e exp/$bmmi_recog/final.mdl ]; then - steps/train_mmi.sh --boost 0.1 \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A \ - data/lang \ - exp/${base_recog}_ali \ - exp/$denlats_dir \ - exp/$bmmi_recog || exit 1; - cp exp/$base_recog/ali.* exp/$bmmi_recog +# decoding using various recognizers with adaptation +if [ $stage -le 11 ]; then + # put tri2b last since it takes longest due to the large mismatch. + for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + # set the adaptation data + if [[ "$recog" =~ _mc ]]; then + tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A + else + tr_dataset=si_tr fi -done - -} + echo "### DECODING with $recog, basis_fmllr, $lm ###" + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/$tr_dataset data/lang exp/$recog + for dataset in data/REVERB_*{dt,et}/*; do + ( + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_basis_fmllr_$decode_suff + ) & + done + wait + + echo " ## MBR RESCORING with $recog, basis_fmllr ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff + cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff & + done + wait -# decoding using bigram / trigram and various recognizers -do_adapt=true -for lm in $lms; do - if [[ "$lm" =~ tg ]]; then - nj=$nj_tg - else - nj=$nj_bg - fi - # put tri2b last since it takes longest due to the large mismatch. 
- for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do - # The graph from the ML directory is used in recipe - recog2=`echo $recog | sed s/_mmi.*//` - graph=exp/$recog2/graph_$lm - for dataset in data/REVERB_dt/SimData_dt* \ - data/REVERB_et/SimData_et* \ - data/REVERB_Real_dt/RealData_dt* \ - data/REVERB_Real_et/RealData_et*; do - if [[ $dataset =~ _dt ]]; then - pdataset=REVERB_dt - elif [[ $dataset =~ _et ]]; then - pdataset=REVERB_et - else - echo "$0: Cannot figure out what to do with: $dataset" - exit 1 - fi - #pdataset=$(basename $(dirname $dataset)) - #echo $pdataset - decode_suff=${lm}_${pdataset}_`basename $dataset` - if [ ! -e exp/$recog/decode_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, noadapt, $lm ###" - steps/decode.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_$decode_suff - cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, noadapt ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff || exit 1 - fi - if $do_adapt; then - if [ ! -e exp/$recog/fmllr.basis ]; then - if [[ "$recog" =~ _mc ]]; then - tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A - else - tr_dataset=si_tr - fi - steps/get_fmllr_basis.sh --per-utt true data/$tr_dataset data/lang exp/$recog || exit 1; - fi - if [ ! -e exp/$recog/decode_basis_fmllr_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, basis_fmllr, $lm ###" - steps/decode_basis_fmllr.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_basis_fmllr_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_basis_fmllr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff - cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, basis_fmllr ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff || exit 1 - fi - fi - - done # loop data set - done # loop recog -done # loop LM + done # loop recog +fi # get all WERs with lmw=15 -local/get_results.sh +if [ $stage -le 12 ]; then + local/get_results.sh +fi diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index b515804cfc2..1014fce03ed 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -229,6 +229,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 7.33 [ 919 / 12533, 80 ins, 153 del, 686 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch3/wer_13 %WER 7.36 [ 923 / 12533, 85 ins, 148 del, 690 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch4/wer_13 +### chain results ### +# current best chain result with TDNN (check local/chain/run_tdnn_5f.sh) +%WER 2.94 [ 369 / 12533, 51 ins, 71 del, 247 sub ] exp/chain/tdnn_5f/decode/wer_3_0.5 ### nnet1 results ### diff --git a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4478796305e..6e2f3e9ee48 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -1,30 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" +export train_cmd=queue.pl +export decode_cmd=queue.pl +export mkgraph_cmd=queue.pl +export cuda_cmd="queue.pl --gpu 1" -# cuda_cmd is used for nnet1 scripts e.g. local/run_dnn.sh, but -# in the nnet2 scripts e.g. local/run_nnet2.sh, this is not -# used and we append options to train_cmd. -cuda_cmd="queue.pl -l arch=*64 -l gpu=1" - -#train_cmd="run.pl" -# with run.pl we do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. +# The rest of this file is here for historical reasons. For cluster-specific +# configuration it's generally better to use conf/queue.conf, see +# http://kaldi-asr.org/doc/queue.html. # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5f.sh new file mode 100644 index 00000000000..0379d16fe13 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_5f.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# this script is a modified version of swbd/run_tdnn_5f.sh + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_5f + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. 
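+  # ('chain' models run at a third of the normal frame rate, hence
+  # --frame-subsampling-factor 3; 1200 is the number of tree leaves, kept small
+  # because RM is a small corpus.)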
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 200 --jesus-forward-output-dim 500 --jesus-hidden-dim 2000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1000000 \ + --lm-opts "--num-extra-lm-states=200" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet2_online/ivectors \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/train $treedir exp/tri3b_lats $dir || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 9 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test $dir/decode || exit 1; +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/sprakbanken/s5/cmd.sh b/egs/sprakbanken/s5/cmd.sh index 43867ccf0d9..71dd849a93b 100644 --- a/egs/sprakbanken/s5/cmd.sh +++ b/egs/sprakbanken/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64 --mem 2G" -#export mkgraph_cmd="queue.pl -l arch=*64 --mem 2G" -#export big_memory_cmd="queue.pl -l arch=*64 --mem 2G" -#export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/sre08/v1/cmd.sh b/egs/sre08/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/sre08/v1/cmd.sh +++ b/egs/sre08/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre10/v1/cmd.sh b/egs/sre10/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100755 --- a/egs/sre10/v1/cmd.sh +++ b/egs/sre10/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/swbd/s5/cmd.sh b/egs/swbd/s5/cmd.sh index 4abf8546b0d..bae7f5cdf45 100644 --- a/egs/swbd/s5/cmd.sh +++ b/egs/swbd/s5/cmd.sh @@ -1,28 +1,16 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." 
-export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5b/cmd.sh b/egs/swbd/s5b/cmd.sh index 4abf8546b0d..575407ac0ff 100644 --- a/egs/swbd/s5b/cmd.sh +++ b/egs/swbd/s5b/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index bba9b4cbfdd..7c2e22888d9 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -105,6 +105,9 @@ exit 0 %WER 14.5 | 1831 21395 | 86.8 8.5 4.6 1.3 14.5 52.4 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg/score_12/eval2000_hires.ctm.swbd.filt.sys %WER 14.8 | 1831 21395 | 86.7 9.0 4.3 1.6 14.8 52.8 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg_per_utt/score_10/eval2000_hires.ctm.swbd.filt.sys + +( +# old results with 25 million parameter model. We do not want to use such a big model. So see the new results below # local/nnet3/run_lstm.sh # these are results with nnet3 LSTMs cell_dim=1280, recurrent_dim=384, lstm_delay=-1 -2 -3, label_delay=5 num_params=25010228 (8 epoch training on speed-perturbed # and volume perturbed data) @@ -114,6 +117,21 @@ exit 0 %WER 18.1 | 4459 42989 | 84.0 11.2 4.8 2.0 18.1 54.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys %WER 22.0 | 2628 21594 | 80.5 13.9 5.6 2.5 22.0 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys %WER 23.3 | 2628 21594 | 79.4 14.7 6.0 2.7 23.3 59.2 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +) + + +# local/nnet3/run_lstm.sh +# these are results with nnet3 LSTMs cell_dim=1024, recurrent_dim=256, nonrecurrent_projection_dim=256, lstm_delay=-1 -2 -3, label_delay=5 num_params=14.6M (8 epoch training on speed-perturbed +# this setup has the newly introduced feature self-repair, in addition to shrink +%WER 11.6 | 1831 21395 | 89.7 6.9 3.4 1.3 11.6 46.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.6 | 1831 21395 | 88.7 7.6 3.7 1.4 12.6 49.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 21.3 | 2628 21594 | 81.0 13.2 5.8 2.4 21.3 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 23.1 | 2628 21594 | 79.5 14.7 5.8 2.6 23.1 59.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 16.5 | 4459 42989 | 85.3 10.1 4.6 1.8 16.5 53.0 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.9 | 4459 42989 | 84.1 11.2 4.8 2.0 17.9 55.5 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 15.17 [ 7466 / 49204, 993 ins, 1937 del, 4536 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +%WER 16.12 [ 7931 / 49204, 1072 ins, 1910 del, 4949 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_tg/wer_11_0.0 + # bidirectional LSTM # ----------------------- @@ -142,7 +160,11 @@ exit 0 %WER 11.3 | 1831 21395 | 90.0 6.8 3.2 1.3 11.3 46.6 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys %WER 13.0 | 1831 21395 | 88.6 7.9 3.6 1.6 13.0 50.4 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys - +# current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) +%WER 10.5 | 1831 21395 | 90.8 6.4 2.9 1.3 10.5 44.3 | 
exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 15.9 | 4459 42989 | 86.0 9.6 4.3 2.0 15.9 51.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 # these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh %WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 3f7de21e279..a14090a74a1 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -1,24 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -# Default opts, -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export cuda_cmd=run.pl # Run on local machine, -export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" -# BUT options, + +# the rest of this file is present for historical reasons. it's better to +# create and edit conf/queue.conf for cluster-specific configuration. if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q long.q -l gpu=1" fi diff --git a/egs/swbd/s5c/conf/mfcc_dbl3.conf b/egs/swbd/s5c/conf/mfcc_dbl3.conf new file mode 100644 index 00000000000..f0e09186f3e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_dbl3.conf @@ -0,0 +1,16 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. 
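+# As a rough illustration (the directory names below are only placeholders), features
+# with this config would be extracted with something like
+#   steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_dbl.conf --nj 30 --cmd "$train_cmd" \
+#     data/train_hires_dbl exp/make_mfcc/train_hires_dbl mfcc_hires_dbl
+# followed by steps/compute_cmvn_stats.sh on the same directories.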
+--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=10 # for the higher-frequency-resolution mfcc coefficients, we'll use + # a larger window size of 25ms and the normal window. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-length=17 # shorter than normal (25ms) frame length.... the shortest we can + # go without the FFT becoming lower resolution which might cause + # problems +--window-type=hanning # additionally making the context shorter by using a more aggressively tapering window. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf new file mode 100644 index 00000000000..c41b76116ee --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf @@ -0,0 +1,12 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-length=20 # slightly less than the normal 25ms frame length. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf new file mode 100644 index 00000000000..92670e7ed6e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hiresf.conf b/egs/swbd/s5c/conf/mfcc_hiresf.conf new file mode 100644 index 00000000000..c0b1798a9c5 --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hiresf.conf @@ -0,0 +1,12 @@ +# this is a config for 'fast' (7.5ms frame shift) high-resolution MFCC features, +# intended for use with chain models. Note: we keep all cepstra, so it has the +# same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. 
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) +--frame-length=25 # the normal frame length +--frame-shift=7.5 diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt new file mode 100644 index 00000000000..8e347f4f889 --- /dev/null +++ b/egs/swbd/s5c/local/chain/README.txt @@ -0,0 +1,29 @@ + +there are a lot of tuning experiments here. + +ones to look at right now: + 2y is a TDNN baseline + 4f is a good jesus-layer system + 4q is an improved TDNN with various bells and whistles from Vijay. + 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far that doesn't have statistics-averaging layers. + 5g uses a statistics-averaging layer in the middle to slightly improve on 5e (by about + 0.2%). + 5j is a basic configuration without iVectors (about 2% abs worse than 5e) + 5k is the best configurations without iVectors... about 1% abs worse than 5e; we + use statistics-averaging layers to do some crude adaptation. + 5t gives about the same performance as 5e but is about 30% faster to train + and is smaller. + 5v is what I am currently using as a baseline- it has an even smaller + --jesus-hidden-dim as 5t (hence faster to train), but gives the same + performance. + 6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component) + and slightly more parameters, which is quicker to train than 5v but gives + about the same results. I'm hoping to use this setup, going forward. + 6i is like 6i but with a separate last-but-one affine layer for the xent output + (marginally better than 6g). + 6z is probably the thing I currently recommend to run-- it's a TDNN+ReLU based + setup that's quite fast to train and gives better results than our old + jesus-layer-based system. 
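+
+ To compare WERs across these experiments, you can use the compare_wer.sh script in
+ this directory, giving it the experiment suffixes listed above, e.g.:
+   local/chain/compare_wer.sh 5e 5t 6g 6z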
+ + diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ded03563711 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer.sh @@ -0,0 +1,62 @@ +#!/bin/bash + + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/run_blstm_6h.sh new file mode 100755 index 00000000000..b19a0b489a0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_6h.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +#%WER 9.6 | 1831 21395 | 91.6 5.8 2.6 1.2 9.6 44.2 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 14.5 | 4459 42989 | 87.4 8.9 3.7 1.9 14.5 50.5 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.3 | 2628 21594 | 83.3 11.8 4.9 2.5 19.3 54.8 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 13.32 [ 6554 / 49204, 830 ins, 1696 del, 4028 sub ] exp/chain/blstm_6h_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 + +label_delay=0 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
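+  # (As in the TDNN recipes, --frame-subsampling-factor 3 matches the reduced output
+  # frame rate of the 'chain' model; 9000 is the number of tree leaves used for the
+  # Switchboard training set.)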
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" [-3,3] [-3,3] [-3,3] " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/run_lstm_6h.sh new file mode 100755 index 00000000000..feb72aee726 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_6h.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +# %WER 15.6 | 4459 42989 | 86.1 9.2 4.7 1.8 15.6 52.1 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# %WER 10.3 | 1831 21395 | 90.9 6.1 3.0 1.3 10.3 44.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 20.7 | 2628 21594 | 82.0 12.8 5.3 2.7 20.7 56.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys + +# if right-tolerance was 10 (these are old results) +#--------------------------- +# %WER 15.8 | 4459 42989 | 86.0 9.3 4.8 1.8 15.8 52.0 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 10.6 | 1831 21395 | 90.6 6.2 3.2 1.2 10.6 45.2 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 21.0 | 2628 21594 | 81.4 12.4 6.3 2.4 21.0 56.8 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +label_delay=5 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" -3 -3 -3 " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_d.sh b/egs/swbd/s5c/local/chain/run_lstm_d.sh new file mode 100755 index 00000000000..05db63c2bee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_d.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# based on run_tdnn_2o.sh + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -3 -3 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --splice-indexes "$splice_indexes " \ + --num-lstm-layers $num_lstm_layers \ + --cell-dim $cell_dim \ + --hidden-dim $hidden_dim \ + --recurrent-projection-dim $recurrent_projection_dim \ + --non-recurrent-projection-dim $non_recurrent_projection_dim \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
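+
+  # Quick summary of the long call below: the --chain.* options set the
+  # sequence-training objective (xent regularization, leaky-HMM coefficient,
+  # L2), the --trainer.* options set the SGD schedule (3 parallel jobs growing
+  # to 16 over 4 epochs, effective learning rate decaying from 0.001 to
+  # 0.0001), and the --egs.* options control how the egs are cut into chunks
+  # of $chunk_width frames with $chunk_left_context frames of extra left
+  # context for the LSTM.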
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/lstm/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh index 2e08d5e22af..a8552244ed2 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh @@ -276,4 +276,4 @@ b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_s b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh ; done %WER 16.57 [ 8155 / 49204, 1144 ins, 1988 del, 5023 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 %WER 16.83 [ 8282 / 49204, 1106 ins, 2115 del, 5061 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 -%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 \ No newline at end of file +%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh index eaa5a77949f..218890cc418 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh @@ -1,69 +1,10 @@ #!/bin/bash - -# _2i is as _2d but with a new set of code for estimating the LM, in which we compute -# the log-like change when deciding which states to back off. The code is not the same -# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By -# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration -# is quite similar to 2d, except new/more-exact code is used. - -# see table in run_tdnn_2a.sh for results - -# _2d is as _2c but with different LM options: -# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" -# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. -# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions -# provided from the tree-building, and effectively puts the leftmost context position as a single -# set. -# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg -# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. - -# _2c is as _2a but after a code change in which we start using transition-scale -# and self-loop-scale of 1 instead of zero in training; we change the options to -# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect -# results at all; it's is mainly for convenience in pushing weights in graphs, -# and checking that graphs are stochastic. - -# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
- -# _z is as _x but setting --lm-opts "--num-extra-states=2000". -# (see also y, which has --num-extra-states=500). - -# _x is as _s but setting --lm-opts "--num-extra-states=0". -# this is a kind of repeat of the u->v experiment, where it seemed to make things -# worse, but there were other factors involved in that so I want to be sure. - -# _s is as _q but setting pdf-boundary-penalty to 0.0 -# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, -# and 18.07 -> 16.96 on train_dev, after fg rescoring. - -# _q is as _p except making the same change as from n->o, which -# reduces the parameters to try to reduce over-training. We reduce -# relu-dim from 1024 to 850, and target num-states from 12k to 9k, -# and modify the splicing setup. -# note: I don't rerun the tree-building, I just use the '5o' treedir. - -# _p is as _m except with a code change in which we switch to a different, more -# exact mechanism to deal with the edges of the egs, and correspondingly -# different script options... we now dump weights with the egs, and apply the -# weights to the derivative w.r.t. the output instead of using the -# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap -# to 30 also. This wil. give 10 frames on each side with zero derivs, then -# ramping up to a weight of 1.0 over 10 frames. - -# _m is as _k but after a code change that makes the denominator FST more -# compact. I am rerunning in order to verify that the WER is not changed (since -# it's possible in principle that due to edge effects related to weight-pushing, -# the results could be a bit different). -# The results are inconsistently different but broadly the same. On all of eval2000, -# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. -# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. - - -# _k is as _i but reverting the g->h change, removing the --scale-max-param-change -# option and setting max-param-change to 1.. Using the same egs. - +# _2i is as _i but it uses speaker perturbation combined with speed perturbation. # _i is as _h but longer egs: 150 frames instead of 75, and # 128 elements per minibatch instead of 256. +# be cautious comparing the valid probs with h though, because +# we fixed the utt2uniq bug at this point, so from h on, the valid probs +# are properly held out. # _h is as _g but different application of max-param-change (use --scale-max-param-change true) @@ -93,21 +34,23 @@ set -e # configs for 'chain' -stage=12 +stage=1 train_stage=-10 get_egs_stage=-10 speed_perturb=true +speaker_perturb=true dir=exp/chain/tdnn_2i # Note: _sp will get added to this if $speed_perturb == true. # TDNN options -splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" # training options num_epochs=4 initial_effective_lrate=0.001 final_effective_lrate=0.0001 leftmost_questions_truncate=30 -max_param_change=1.0 +max_param_change=0.3333 +scale_max_param_change=true final_layer_normalize_target=0.5 num_jobs_initial=3 num_jobs_final=16 @@ -138,16 +81,19 @@ suffix= if [ "$speed_perturb" == "true" ]; then suffix=_sp fi +if [ "$speaker_perturb" == "true" ]; then + suffix=$suffix"_fp" +fi dir=${dir}$suffix train_set=train_nodup$suffix ali_dir=exp/tri4_ali_nodup$suffix -treedir=exp/chain/tri5o_tree$suffix +treedir=exp/chain/tri5f_tree$suffix # if we are using the speed-perturbed data we need to generate # alignments for it. 
-local/nnet3/run_ivector_common.sh --stage $stage \ - --speed-perturb $speed_perturb \ +local/nnet3/run_ivector_common_2.sh --stage $stage \ + --speed-perturb $speed_perturb --speaker-perturb $speaker_perturb \ --generate-alignments $speed_perturb || exit 1; @@ -161,6 +107,7 @@ if [ $stage -le 9 ]; then fi +if false; then #100 if [ $stage -le 10 ]; then # Create a version of the lang/ directory that has one state per phone in the # topo file. [note, it really has two states.. the first one is only repeated @@ -179,23 +126,23 @@ if [ $stage -le 11 ]; then # Build a tree using our new topology. steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate $leftmost_questions_truncate \ - --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir fi +fi #100 if [ $stage -le 12 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi touch $dir/egs/.nodelete # keep egs around when that run dies. steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ - --pdf-boundary-penalty 0.0 \ - --lm-opts "--num-extra-lm-states=2000" \ --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ --minibatch-size $minibatch_size \ - --egs-opts "--frames-overlap-per-eg 30" \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ --frames-per-eg $frames_per_eg \ --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ --splice-indexes "$splice_indexes" \ @@ -205,7 +152,7 @@ if [ $stage -le 12 ]; then --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ --max-param-change $max_param_change \ --final-layer-normalize-target $final_layer_normalize_target \ - --relu-dim 850 \ + --relu-dim 1024 \ --cmd "$decode_cmd" \ --remove-egs $remove_egs \ data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; @@ -215,7 +162,8 @@ if [ $stage -le 13 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi decode_suff=sw1_tg diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh index 4c0ac7e62ca..d17ebdf9be7 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh @@ -301,4 +301,4 @@ LOG (lattice-best-path:main():lattice-best-path.cc:99) For utterance sp1.0-sw028 LOG (lattice-best-path:main():lattice-best-path.cc:124) Overall score per frame is 46.9461 = 0.0637047 [graph] + 46.8824 [acoustic] over 843 frames. LOG (lattice-best-path:main():lattice-best-path.cc:128) Done 1 lattices, failed for 0 LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances. 
-sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil \ No newline at end of file +sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh new file mode 100755 index 00000000000..4f350891e8a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# _3c is as _2y, but using 'jesus' nonlinearity: the --jesus-dim 800 option, instead of +# --relu-dim 850. +# reusing the egs from 2y. +# caution: see config section, I changed some things while running. + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. 
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. 
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +# max_param_change=1.0 +max_param_change=0.5 # Changed it to this value on iteration 74. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 # switched to 64 on iteration 7 after a failure. +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --jesus-dim 800 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
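+  # (The --self-loop-scale 1.0 below matches the convention introduced in _2c
+  # above, where training switched to transition/self-loop scales of 1 instead
+  # of 0; see that note for the rationale.)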
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh new file mode 100755 index 00000000000..ca8080db080 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# (note: cannot be reproduced using current scripts). +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# Results are about the same as 2y, or maybe just a little worse. + +# a03:s5c: ./show_wer.sh 3d +# %WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
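+
+  # Reminder: --egs-dir below points at the egs already dumped for 2y, so the
+  # egs-related options here presumably only matter if the egs ever have to be
+  # re-dumped; --jesus-opts is passed through to the old make_jesus_configs.py
+  # setup mentioned in the header (which is why this run cannot be reproduced
+  # with current scripts).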
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh new file mode 100755 index 00000000000..af5661b8c85 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh @@ -0,0 +1,275 @@ +#!/bin/bash + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. +# (note: cannot be reproduced using current scripts). + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
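+
+  # This is the same training call as in run_tdnn_3d.sh except for the extra
+  # --num-jesus-blocks 200 inside --jesus-opts; as the header says, the point
+  # is to reduce computation in the Jesus layer.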
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh new file mode 100755 index 00000000000..f33459f5f08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh @@ -0,0 +1,283 @@ +#!/bin/bash + + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# (note: cannot be reproduced using current scripts). +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies.
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh new file mode 100755 index 00000000000..ff1e539306f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh @@ -0,0 +1,303 @@ +#!/bin/bash + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# (note: cannot be reproduced using current scripts). +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
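The _3e and _3g notes above adjust --num-jesus-blocks (200 vs. 100) to trade computation in the Jesus layer against accuracy. Below is a back-of-envelope sketch of why the block count matters, under the assumption that the large Jesus-layer matrices are block-diagonal with that many blocks; the 1800/15000 dimensions are taken from the --jesus-opts above, but the internal matrix shapes are an assumption, not something this patch states.

def block_diag_params(in_dim, out_dim, num_blocks):
    # Each block maps (in_dim / num_blocks) -> (out_dim / num_blocks), so the
    # total weight count scales as 1 / num_blocks relative to a full matrix.
    return num_blocks * (in_dim // num_blocks) * (out_dim // num_blocks)

for num_blocks in (100, 200):
    p = block_diag_params(1800, 15000, num_blocks)
    print("%3d blocks: %.3fM weights" % (num_blocks, p / 1e6))
# 100 blocks: 0.270M, 200 blocks: 0.135M -- doubling the block count halves
# the weights (and multiplies) in such a matrix.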
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0. +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
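The _p note above describes the egs edge handling these scripts rely on: with a frames-overlap of 30, roughly 10 frames at each edge of a chunk get zero derivative weight and the next 10 ramp linearly up to 1.0. A small sketch of what such a weight vector would look like for a 150-frame eg follows; this is illustrative arithmetic only, and the actual egs-dumping code may compute the ramp differently.

def edge_deriv_weights(num_frames=150, zero_frames=10, ramp_frames=10):
    # zero_frames frames of weight 0 at each edge, then a linear ramp of
    # ramp_frames frames up to 1.0, and 1.0 in the interior.
    w = []
    for t in range(num_frames):
        d = min(t, num_frames - 1 - t)  # distance to the nearest chunk edge
        if d < zero_frames:
            w.append(0.0)
        elif d < zero_frames + ramp_frames:
            w.append((d - zero_frames + 1) / float(ramp_frames))
        else:
            w.append(1.0)
    return w

print(edge_deriv_weights()[:25])  # 10 zeros, then 0.1, 0.2, ..., 1.0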
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh new file mode 100755 index 00000000000..f0e9efc2ac4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
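Several of the notes above mention that re-using the 2y egs is "not 100% ideal as we'd like some context" once recurrence is added, and the decode stages below pass --extra-left-context for the same reason. A rough way to see the context implied by the forward splicing alone (the recurrent ":-3" connections need additional left context beyond this, which is the point) is the following illustrative computation:

# Left/right context from the forward splice offsets of the five layers above;
# the recurrent connections are ignored here, which is exactly why extra left
# context is wanted at training and decode time.
splice_indexes = [[-2, -1, 0, 1, 2], [-1, 2], [-3, 0, 3], [-6, -3, 0, 3], [-6, -3, 0, 3]]
left_context = sum(-min(layer) for layer in splice_indexes)
right_context = sum(max(layer) for layer in splice_indexes)
print("left context:", left_context, " right context:", right_context)  # 18 and 13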
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh new file mode 100755 index 00000000000..876048b5852 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. 
+# also a code fix (the recurrent connections weren't being used; bug in OptionalDescriptor) + +# Here is the original decoding, with frame-per-chunk=50 +#./show_wer.sh 3i +#%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# and a newer decoding with frames-per-chunk=100. +# ./show_wer.sh 3i +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# after initial decoding wasn't great, trying increasing frames-per-chunk from +# 50 to 100. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 100 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh new file mode 100755 index 00000000000..faef84e8879 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh @@ -0,0 +1,296 @@ +#!/bin/bash + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3j # Note: _sp will get added to this if $speed_perturb == true. 
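The 3j header above swaps RepeatedAffineComponent for BlockAffineComponent in the Jesus layers (--use-repeated-affine false). My reading is that the repeated variant ties one small affine transform across all blocks while the block variant gives each block its own parameters; the comparison below is a back-of-envelope sketch under that assumption, using the 1500/15000 dimensions from the --jesus-opts as example sizes (the actual internal shapes are not taken from this patch).

def repeated_affine_params(in_dim, out_dim, num_blocks):
    # Assumption: one shared (in/B x out/B) weight block plus a shared bias.
    return (in_dim // num_blocks) * (out_dim // num_blocks) + out_dim // num_blocks

def block_affine_params(in_dim, out_dim, num_blocks):
    # Assumption: a separate weight block per block, plus a full bias.
    return num_blocks * (in_dim // num_blocks) * (out_dim // num_blocks) + out_dim

B = 100  # num-jesus-blocks used from 3g onward
print("repeated-affine params:", repeated_affine_params(1500, 15000, B))  # 2400
print("block-affine params:   ", block_affine_params(1500, 15000, B))     # 240000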
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh new file mode 100755 index 00000000000..b869c7b2553 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3k # Note: _sp will get added to this if $speed_perturb == true.
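+
+# (The "chunk size" discussed in the header comments is the decode-time
+#  --frames-per-chunk option of steps/nnet3/decode.sh; the decode stage below
+#  leaves it at the default of 50 mentioned above, while run_tdnn_3k2.sh later
+#  in this patch sets it explicitly.)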
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh new file mode 100755 index 00000000000..7a016ed2197 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# 3k2 is as 3k, but dumping the egs with --extra-left-context 20. 
+# Also there will have been some script changes in the meantime, +# e.g. possibly nonzero bias-mean; and reduced max-change on mix-up +# iters. + +# log-probs are better than 3k and in fact better than any experiment so far: +# valid -0.115->-0.107, and train -0.077 to -0.074. + +# Here is the WER using the default --frames-per-chunk of 50, and --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 20.45 [ 10060 / 49204, 988 ins, 3050 del, 6022 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 19.02 [ 9359 / 49204, 977 ins, 2877 del, 5505 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 22.3 | 4459 42989 | 79.9 12.8 7.3 2.3 22.3 60.2 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 20.4 | 4459 42989 | 81.5 11.1 7.4 1.9 20.4 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.filt.sys + +#... and here is the WER after changing it to 150, still with --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 18.91 [ 9306 / 49204, 1076 ins, 2517 del, 5713 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 17.43 [ 8574 / 49204, 958 ins, 2607 del, 5009 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.0 2.4 20.6 58.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 18.8 | 4459 42989 | 83.4 10.9 5.6 2.3 18.8 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is --frames-per-chunk 150, --extra-left-context 50 (changing the extra-left-context from 20 to 50 makes it worse): +#./show_wer.sh 3k2 +#%WER 19.46 [ 9574 / 49204, 1134 ins, 2635 del, 5805 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.87 [ 8792 / 49204, 880 ins, 3011 del, 4901 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 21.0 | 4459 42989 | 81.2 12.4 6.3 2.2 21.0 58.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.2 | 4459 42989 | 82.7 10.8 6.5 1.9 19.2 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --frames-per-chunk 150, --extra-left-context 50, --extra-left-context-initial 20. +#./show_wer.sh 3k2 +#%WER 19.10 [ 9400 / 49204, 1116 ins, 2498 del, 5786 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.54 [ 8628 / 49204, 884 ins, 2890 del, 4854 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.1 2.3 20.6 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 18.7 | 4459 42989 | 83.4 10.8 5.8 2.1 18.7 55.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --extra-left-context-initial 20 --extra-left-context 50 --frames-per-chunk 100. +# I think what's happening is that it's figuring out when it's near the end of the chunk, and encouraging +# deletions at that point, for reasons that relate to edge effects in the objective function. 
+#./show_wer.sh 3k2 +#%WER 17.87 [ 8793 / 49204, 1061 ins, 2277 del, 5455 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.36 [ 8049 / 49204, 1033 ins, 2148 del, 4868 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.8 11.8 5.5 2.5 19.7 57.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.4 10.3 5.2 2.2 17.8 54.7 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. 
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k2 # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --extra-left-context 20 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial 20 \ + --extra-left-context 50 \ + --frames-per-chunk 100 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh new file mode 100755 index 00000000000..608e437659e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +# [abandoned, not working well.] +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. 
I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. 
So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. 
+ +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
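+
+  # The call below re-uses the egs dumped by the 2y run (via --egs-dir) instead
+  # of generating new ones, and, as noted in the header comments, replaces the
+  # RepeatedAffineComponent in the Jesus layers with BlockAffineComponent via
+  # the --use-repeated-affine false option inside --jesus-opts.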
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh new file mode 100755 index 00000000000..b25f9f15130 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# [note: this uses BlockAffineComponent not RepeatedAffineComponent] +# _3m is as _3l, but changing --jesus-stddev-scale from 0.2 to 0.1, as the Jesus layers +# were learning too slowly in 3l (this will make them learn approximately 4x faster). +# [terminated, likelihoods were not promising]. + +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. 
(see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.1 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
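+  # (The topology that matters at decode time comes from the model in $dir,
+  # which was built on top of the tree in $treedir; data/lang_sw1_tg is used
+  # here essentially only for the lexicon and grammar side of the graph.)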
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh new file mode 100755 index 00000000000..dedbd84be75 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. 
+ +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh new file mode 100755 index 00000000000..14383fe1a32 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. +# [ seemed helpful based on likelihoods on first iterations]: on iter 42, +# train prob is -0.1554->-0.1523, and valid prob is -0.1559->-0.1540. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. 
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh new file mode 100755 index 00000000000..ddba7e7f9c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# Comparing the WER with 2y, it's about 1% abs worse [see below]. However, this is +# for an odd reason: the model, while smaller than the 2y one (8.8 vs. 12.1 million +# parameters), seems to have a lot more learning capacity, with better train and worse valid +# prob. In 3r and 3s I am trying smaller versions of this architecture. 
+ +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# 2y 3p +# final-train-prob: -0.083068 -0.0771 +# final-valid-prob: -0.01212 -0.12715 +# num-parameters: 12094115 8804087 + + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, 
more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3p # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh new file mode 100755 index 00000000000..9f67164b806 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh @@ -0,0 +1,315 @@ +#!/bin/bash + +# _3q is as _3p, but now trying out the 'block' training script, where in addition to +# the affine connections we have block-matrix connections between the layers. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
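+
+# Rough arithmetic for the 'block' connections mentioned in the _3q note at the
+# top of this file, assuming --num-affine-blocks 25 means the 900-dim block
+# input/output (see --jesus-block-opts below) is split into 25 contiguous
+# blocks of 900/25 = 36 dims, each with its own small affine: that is
+# 25 * 36 * 36 = 32,400 weights versus 810,000 for a full 900x900 affine,
+# which is presumably what makes it cheap to add these on top of the ordinary
+# affine connections.
+# echo $(( 900*900 )) $(( 25 * (900/25) * (900/25) ))   # -> 810000 32400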
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
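+
+# To picture the egs-edge weighting described in the _p note above (zero
+# derivative weight for the 10 frames nearest each edge, then a linear ramp up
+# to 1.0 over the next 10 frames), here is a commented sketch for a 150-frame
+# eg; the exact shape is whatever the egs-dumping code actually produces, this
+# just illustrates the idea:
+# awk -v T=150 'BEGIN { for (t=0; t<T; t++) {
+#     d = (t < T-1-t) ? t : T-1-t;                 # distance to the nearer edge
+#     w = (d < 10) ? 0 : (d < 20 ? (d-9)/10 : 1);  # 0, then ramp, then 1.0
+#     printf("%d %.1f\n", t, w) } }'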
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-block-opts "--jesus-full-output-dim 900 --jesus-full-input-dim 900 --jesus-block-input-dim 900 --jesus-block-output-dim 900 --jesus-hidden-dim 15000 --jesus-final-output-dim 600 --jesus-stddev-scale 0.4 --num-affine-blocks 25 --final-layer-target-rms 0.5" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,0,3 -6,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh new file mode 100755 index 00000000000..7815adffb9f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] +# [I think I abandoned this after deciding to reduce the parameters even further, +# to the setup in 3s]. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. 
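+
+# (On the 1/2 -> 1/sqrt(2) change in the _3r note above: presumably the idea is
+# that with the gradient accumulated over a minibatch, the noisy part of each
+# update scales roughly like sqrt(minibatch-size), so halving the minibatch
+# shrinks the typical step by about 1/sqrt(2); scaling the max-change by the
+# same factor keeps the limit proportionate, where a factor of 1/2 would
+# over-tighten it.)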
+ +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh new file mode 100755 index 00000000000..6cee8b11925 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
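+# (Aside, not from the original notes, just my reading of the option: the
+# leftmost_questions_truncate variable is set to -1 in the config section
+# further down and passed straight to steps/nnet3/chain/build_tree.sh in stage
+# 11, so "disabling that mechanism" above means, as far as I understand it,
+# that the tree-building questions on the leftmost context position are not
+# truncated at all.)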
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
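+# (Sketch for reference, not taken from the actual egs-dumping code: the edge
+# weighting described in the '_p' note above can be pictured as 10 frames of
+# zero derivative weight at each end of an eg, followed by a linear ramp up to
+# 1.0 over the next 10 frames.  The frame indexing below is illustrative only:
+#   for t in $(seq 0 24); do
+#     if [ $t -lt 10 ]; then w=0.00
+#     elif [ $t -lt 20 ]; then w=$(awk -v t=$t 'BEGIN{printf "%.2f", (t-9)/10}')
+#     else w=1.00
+#     fi
+#     echo "frame $t deriv-weight $w"
+#   done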
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
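+  # (Sketch, not part of the recipe: the offsets in --splice-indexes compose
+  # across layers, so the network's total context is the sum of each layer's
+  # most negative / most positive offset.  For the 3s setup below,
+  # "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3", that is 2+1+3+6+6 = 18
+  # frames of left context and 2+2+3+3+3 = 13 of right context.  A throwaway
+  # check, assuming awk is available:
+  #   echo "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" | awk '{
+  #     left=0; right=0;
+  #     for (i=1; i<=NF; i++) { n=split($i,a,","); lo=a[1]+0; hi=a[1]+0;
+  #       for (j=2; j<=n; j++) { v=a[j]+0; if (v<lo) lo=v; if (v>hi) hi=v }
+  #       left-=lo; right+=hi }
+  #     print "left-context", left, "right-context", right }'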
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh new file mode 100755 index 00000000000..25e30900e36 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +# _3t is as _3s but using slightly wider context. Dumping our own egs. +# The final train prob is better -0.0851->-0.0815, but valid prob is worse -0.1231->-0.1243. +# WER is slightly worse. So we won't use this for now, but later if we use more data we +# could try wider context like this. 
+#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# +#%WER 18.01 [ 8860 / 49204, 1043 ins, 2315 del, 5502 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.68 [ 8205 / 49204, 930 ins, 2420 del, 4855 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.6 11.9 5.5 2.3 19.7 57.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.4 5.4 2.0 17.8 55.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
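+  # (Rough sketch, not part of the recipe, of how I understand the lrate
+  # options used below: the rate actually applied on a given iteration is the
+  # 'effective' rate times the number of jobs running on that iteration, and
+  # the effective rate decays exponentially from $initial_effective_lrate to
+  # $final_effective_lrate over training.  The iteration count of 300 here is
+  # made up, purely for illustration:
+  #   awk 'BEGIN{ n=300; a=0.001; b=0.0001;
+  #        for (i=0; i<=n; i+=100)
+  #          printf "iter %d effective-lrate %.6f\n", i, a*exp(i/n*log(b/a)) }'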
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh new file mode 100755 index 00000000000..d1b93d9084c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +# _3u is as _3s (and re-using the egs) but with one more layer; keeping the same dim +# and total context, and reducing --jesus-forward-output-dim from 1500 to 1300 to +# ensure that the number of parameters doesn't increase too much. +# [stopping this run, as the likelihoods weren't promising, e.g. by iteration +# 39, the valid-prob was worse vs. 3t, -0.1488 -> -0.1521 (train: -0.1510 -> -0.1532) + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] 
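+# (Aside, not from the original notes: as far as I understand the nnet3 code,
+# RepeatedAffineComponent applies one small shared affine transform to each of
+# the consecutive equal-sized blocks of its input, which is what keeps the
+# large Jesus-layer dims affordable; the change referred to above is, I
+# believe, the natural-gradient variant of that component.)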
+ +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
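+  # (Check, for reference: the "same total context" claim in the header can be
+  # verified by summing each layer's extreme offsets.  The 3t splicing
+  # "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" gives
+  # 2+3+3+6+6 = 20 frames of context on each side; the six-layer 3u splicing
+  # below gives 2+3+3+3+3+6 = 20 on each side as well, consistent with
+  # re-using the 3t egs via --egs-dir below.)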
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh new file mode 100755 index 00000000000..c7fcb7e24f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# _3v is as _3t but decreasing the --num-jesus-blocks from 100 to 50. +# I stopped it early after likelihoods were not promising: +# on iter 90, train prob was -0.1226->-0.1240, valid -0.1304->-0.1340. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. 
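+# (Aside, not from the original notes: the 1/sqrt(2) factor mentioned in the
+# '_3r' note above follows the usual assumption that the noise in a minibatch
+# gradient estimate scales like 1/sqrt(minibatch-size); halving the minibatch
+# therefore scales typical update magnitudes by about sqrt(2), and shrinking
+# max-change by the same factor keeps the constraint comparably tight:
+#   awk 'BEGIN{ printf "1/sqrt(2) = %.4f\n", 1/sqrt(2) }'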
+ +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
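+  # (Side note, not part of the recipe: the brace expansion in the
+  # create_split_dir.pl call above simply enumerates four storage disks;
+  #   echo /export/b0{1,2,3,4}
+  # prints "/export/b01 /export/b02 /export/b03 /export/b04", and the script
+  # then spreads the actual storage behind $dir/egs/storage across those
+  # paths via symlinks, so egs I/O is not bottlenecked on one file system.)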
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --num-jesus-blocks 50 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh new file mode 100755 index 00000000000..e4165e54de6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh @@ -0,0 +1,332 @@ +#!/bin/bash + +# _3w is as _3t but instead of having a rectangular affine component in each +# layer, making it square (700->600 not 1300->400), and introducing a new script +# option --final-hidden-dim to have something like a bottleneck at the last +# layer, to avoid a blowup in parameters. +# (note: num-params was slightly smaller, 4.8 million vs 5.3 +# I stopped this on iter 65 after likelihoods were not promising: +# on iter 63, train -0.133->-0.138, valid -0.138->-0.141. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. 
+# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. 
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on 2m 2o 2y [ now comparing 2o->2y:]
+# train_dev,tg 17.22 17.24 16.99 0.2% better
+# train_dev,fg 15.87 15.93 15.86 0.1% better
+# eval2000,tg 18.7 18.7 18.9 0.2% worse
+# eval2000,fg 17.0 16.9 17.0 0.1% worse
+
+# train-prob,final -0.0803 -0.0835
+# valid-prob,final -0.0116 -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will
give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
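+
+  # Per the header above: relative to 3t the egs are re-used (--egs-dir points at
+  # the 3t egs), the jesus-opts below use --jesus-forward-input-dim 600 and
+  # --jesus-forward-output-dim 800, and the new --final-hidden-dim 400 option
+  # acts as a bottleneck at the last layer to avoid a blowup in parameters.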
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 800 --final-hidden-dim 400 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh new file mode 100755 index 00000000000..1585d209a93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# _3x is as _3s (and continuing the same kind of experimentation as in 3t->3w)... +# increasing --jesus-forward-output-dim from 1500 to 2000. +# More overtraining: final-train -0.0852->-0.0799, final-valid -0.1231->-0.1261, +# WER effect is very tiny but maybe slightly better. 
+#a03:s5c: ./show_wer.sh 3x +#%WER 17.78 [ 8750 / 49204, 910 ins, 2405 del, 5435 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 16.60 [ 8166 / 49204, 921 ins, 2290 del, 4955 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.5 | 4459 42989 | 82.7 11.4 5.9 2.2 19.5 57.5 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.7 | 4459 42989 | 84.3 10.3 5.5 1.9 17.7 54.6 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
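+
+  # Per the header above: relative to 3s, --jesus-forward-output-dim goes from
+  # 1500 to 2000; --final-hidden-dim 350 (the bottleneck option introduced in 3w)
+  # is also set, and the egs dumped by the 3t run are re-used via --egs-dir.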
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_3t_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 2000 --final-hidden-dim 350 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh
new file mode 100755
index 00000000000..042ec84898b
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# _3y is as _3s but doubling jesus-hidden-dim from 15000 to 30000.
+# not promising: by iteration 228, train prob changed -0.09583->-0.09575, and
+# valid prob from -0.1213 -> -0.1239. Killed it.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining. Results are a bit better than 3p but still
+# not as good as 2y.
+ +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
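+
+  # Per the header above: relative to 3s, --jesus-hidden-dim is doubled from
+  # 15000 to 30000; the egs from the 2y run are re-used via --egs-dir.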
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 30000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh new file mode 100755 index 00000000000..f1fa2c5a45e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +# _3z is as _3s, but reducing the target num-states in the tree building from 9k to 6k. +# A slight degradation in WER, but it's not 100% consistent. The final train-prob +# was worse -0.0852 -> -0.0888, and valid-prob was worse -0.1231->-0.1280. +#./show_wer.sh 3z +#%WER 18.05 [ 8883 / 49204, 990 ins, 2397 del, 5496 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.50 [ 8120 / 49204, 960 ins, 2234 del, 4926 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.5 11.9 5.5 2.2 19.7 57.6 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 1.9 17.8 55.1 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
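+
+  # Per the header above, 3z only changes the tree: build_tree.sh (stage 11) is
+  # run with 6000 leaves instead of the 9000 used in 3s, and stage is set to 11
+  # at the top of this script so that the tree-building stage actually runs.
+  # Unlike 3w/3x/3y, no --egs-dir is passed below, so the egs are not re-used
+  # from an earlier run.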
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh
new file mode 100755
index 00000000000..c02ad2cb0e4
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh
@@ -0,0 +1,349 @@
+#!/bin/bash
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+# WER is maybe a fraction worse than 3s (see below); final train prob is
+# worse -0.0852 -> -0.0879, and valid prob is better -0.1231 -> -0.1213.
+#./show_wer.sh 4a
+#%WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
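As a pointer forward: the --leftmost-questions-truncate=-1 setting described in the _2m note just below is what the configuration section of this script still defaults to. A minimal restatement of how that value reaches the tree-building stage, with the paths and the 9000-leaf target copied from stage 11 of this script (nothing new is introduced here):

# -1 disables truncation of the leftmost-context questions, per the _2m note.
leftmost_questions_truncate=-1
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
  --leftmost-questions-truncate $leftmost_questions_truncate \
  --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir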
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
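To make the _p and _2k notes concrete, the two ways the edge-of-eg handling is expressed in the train_tdnn.sh calls of this patch are sketched below; both option strings are copied verbatim from this script and from run_tdnn_4d.sh later in the patch, and are shown side by side purely for comparison:

# 4a-style (this script): no overlap between egs, derivative weights not applied.
apply_deriv_weights=false
egs_opts="--frames-overlap-per-eg 0"
# 4d-style (later in this patch): overlapped egs with zeroed, then ramped, derivative weights.
apply_deriv_weights=true
egs_opts="--frames-overlap-per-eg 10 --cut-zero-frames 5"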
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh new file mode 100755 index 00000000000..aad278c3037 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4b is as _4a, but even narrower splice-indexes in 1st layer (no splicing) +# stopped early after train and valid likelihoods were not promising. +# [later accidentally overwrote and moved the dir.] + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
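Since _4a and _4b differ from _3s only in the first entry of --splice-indexes, the three strings are easier to compare side by side. The 4a and 4b values are copied from the train_tdnn.sh calls in this patch; the wider first layer is the one used by the preceding script in the patch, presumed here to be the 3s configuration:

# The first layer narrows from 5 spliced frames, to 3, to none; later layers are unchanged.
splice_indexes_3s="-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
splice_indexes_4a="-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
splice_indexes_4b="0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"   # no splicing in layer 1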
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
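A compact restatement of the LM options referred to above: the older estimation code (_2d and earlier) took options spelled with --num-extra-states, while the code used from _2i onwards, and by the train_tdnn.sh call below, exposes only --num-extra-lm-states. Both strings are copied from the comments and the script, not new options:

# pre-_2i code path: explicit pruned-4-gram options.
lm_opts_old="--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
# _2i-and-later code path (what this script passes as --lm-opts):
lm_opts_new="--num-extra-lm-states=2000"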
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh new file mode 100755 index 00000000000..d9060251844 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
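As a quick sanity check on the _2y change, the per-iteration eg count implied by the values set in this script (1.2 million frames per iteration, 150 frames per eg) works out as below; this is just arithmetic on numbers already in the script:

frames_per_iter=1200000
frames_per_eg=150
echo $((frames_per_iter / frames_per_eg))   # 8000 egs per iteration, versus ~5333 at the old 800k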
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh new file mode 100755 index 00000000000..1ae220dc21a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4d is as _4a, but with --egs-opts "--frames-overlap-per-eg 10 +# --cut-zero-frames 5" and changing apply-deriv-weights to true... this to +# activate the new-style derivative weights. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
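For orientation, the jesus-layer sizes discussed in the _3r/_3s/_4c notes, as they appear in the --jesus-opts strings of this patch; this script (4d) keeps the 4a sizes, and only 4c halves the hidden dim (the _3r hidden dim is inferred from the _3d options quoted above):

# forward-input-dim / forward-output-dim / hidden-dim
#   _3r:             500 / 1500 / 15000
#   _3s, _4a, _4d:   400 / 1500 / 15000
#   _4c:             400 / 1500 /  7500
jesus_opts="--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25"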
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
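The _2c note above is the reason every graph-building stage in this patch passes --self-loop-scale 1.0 to mkgraph; the stage-13 fragment, copied from further down in this script, is simply:

# Test-time transition/self-loop scale of 1.0, matching the training-side change made in _2c.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg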
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights true \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --cut-zero-frames 5" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh new file mode 100755 index 00000000000..fea5495ee06 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh @@ -0,0 +1,362 @@ +#!/bin/bash + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. +# big improvement- about 0.7% WER abs. Considering the non-l2 part of the objf, the +# final valid objf c->e is -0.1241->-0.1266 [and the l2 term is -0.0196]. +# and for the training st it's -0.08820 -> -0.1149. 
+ + +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4c +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. 
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. 
It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
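+
+  # As noted at the top of this script, the only change relative to the 4c
+  # configuration is the --l2-regularize 0.0001 option passed below; the egs
+  # are again re-used from the tdnn_2y_sp run via --egs-dir.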
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.0001 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh new file mode 100755 index 00000000000..36d5f188c56 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh @@ -0,0 +1,366 @@ +#!/bin/bash + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+        --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" \
+        --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+          $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh
new file mode 100755
index 00000000000..430c6c28c70
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh
@@ -0,0 +1,365 @@
+#!/bin/bash
+
+# _4g is as _4c, but reducing the --jesus-hidden-dim further from 7500 to 4000.
+# Strangely, the trend from 4a->4c does not continue: instead of continuing to get worse,
+# the train and valid probs both get better.
+
+#                    4a       4c       4g
+# Final train prob:  -0.0879  -0.08820 -0.08784
+# Final valid prob:  -0.1214  -0.1241  -0.1204
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+# Yay-- WER is slightly better or the same. Final train-prob is worse
+# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241.
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 4000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh new file mode 100644 index 00000000000..9125d4e7967 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh @@ -0,0 +1,386 @@ +#!/bin/bash + +# _4n is as _4f, but adding the [new] option --convert-repeated-to-block-iter=100. +# reusing iter 100 of model 4f to avoid some iterations of training [did this by +# doing (cd exp/chain; cp -r tdnn_4f_sp tdnn_4n_sp), and then running this script with +# --iter 100]. +# [note: to get the block-affine stuff to train fast enough to make a difference +# I multiplied a factor of sqrt(num-blocks) into the learning-rate factor in +# the code. That change is not committed.] +# +# Essentially no effect on WER, but train and valid probs are worse. +# ./compare_wer.sh 4f 4n +# System 4f 4n +# WER on train_dev(tg) 16.83 16.84 +# WER on train_dev(fg) 15.73 15.69 +# WER on eval2000(tg) 18.4 18.4 +# WER on eval2000(fg) 16.6 16.6 +# Final train prob -0.105832 -0.111309 +# Final valid prob -0.123021 -0.123601 + + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
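The show_wer.sh and compare_wer.sh helpers quoted throughout these comments are not included in this patch. As an editorial aside, a minimal stand-in for the train_dev numbers, assuming the standard Kaldi scoring layout (per-LM-weight wer_* files in each decode directory and utils/best_wer.sh on the path), might look like the sketch below; the eval2000 rows instead come from the sclite .sys files under score_*/ visible in the paths above.

    # hypothetical helper, not part of the patch: print the best WER line per decode dir
    for d in exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg \
             exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg; do
      grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
    done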
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --convert-repeated-to-block-iter 100 \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh new file mode 100755 index 00000000000..d2b073cdc77 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# _4p is as _4f, but one fewer layer, and making the final-layer context wider to +# compensate; also increasing the jesus-layer input and output dims 400->500 and 1500->1600 to +# somewhat compensate for the reduction in parameters. + +# definitely worse. Later with 4r I go in the opposite direction by adding a new layer, +# and get a small improvement. +# ./compare_wer.sh 4f 4p +# System 4f 4p +# WER on train_dev(tg) 16.83 17.36 +# WER on train_dev(fg) 15.73 16.10 +# WER on eval2000(tg) 18.4 19.1 +# WER on eval2000(fg) 16.6 17.2 +# Final train prob -0.105832 -0.104439 +# Final valid prob -0.123021 -0.125576 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
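The parameter counts quoted in the _3s note above (5.4 million, versus 8.8 million for 3p and 12.1 million for 2y) can be checked directly from the trained models. An editorial sketch, assuming the experiment directories exist and Kaldi's nnet3-am-info binary is on the PATH (its output should include a num-parameters line):

    # not part of the patch: print the parameter count of each trained model
    for x in 2y 3p 3s; do
      printf 'tdnn_%s_sp: ' "$x"
      nnet3-am-info exp/chain/tdnn_${x}_sp/final.mdl | grep num-parameters
    done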
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 450 --jesus-forward-output-dim 1600 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..9f2534f4f22 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh new file mode 100755 index 00000000000..64831b5802a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
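A note on the --splice-indexes strings discussed above (_3d, _3f, _4a): the per-layer offsets compound, so the model's total acoustic context is the sum of the most negative offsets (left) and of the most positive offsets (right) across layers, and that total context is what decides whether previously dumped egs can be re-used. An editorial sketch that computes the context for one of the strings used in these scripts, ignoring the recurrence entries after ':':

    # not part of the patch; assumes offsets within a layer are comma-separated
    splice_indexes="-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
    left=0; right=0
    for layer in $splice_indexes; do
      layer=${layer%%:*}                          # drop recurrence offsets such as ':-3'
      min=$(echo "$layer" | tr ',' '\n' | sort -n | head -n1)
      max=$(echo "$layer" | tr ',' '\n' | sort -n | tail -n1)
      left=$((left - min)); right=$((right + max))
    done
    echo "total left context: $left, total right context: $right"   # 17 and 12 here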
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
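To make the _p note above concrete: with the weights dumped alongside the egs and --frames-overlap-per-eg 30, the derivative weights at each chunk edge are zero for 10 frames and then ramp linearly up to 1.0 over the next 10. A hedged illustration of that schedule (the scripts' exact ramp may differ in detail):

    # editorial sketch: print an assumed 30-frame edge-weight schedule
    awk 'BEGIN {
      for (t = 0; t < 30; t++) {
        if      (t < 10) w = 0.0;                # zero-derivative region
        else if (t < 20) w = (t - 9) / 10.0;     # linear ramp: 0.1, 0.2, ..., 1.0
        else             w = 1.0;                # full weight
        printf("frame %2d  weight %.1f\n", t, w);
      }
    }'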
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
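# [editorial aside, not part of the patch] The create_split_dir.pl block above is
# specific to the CLSP grid: the brace expansion lists several local disks, and the
# script spreads the (large) egs archives across them via symlinks under
# $dir/egs/storage. On another cluster you would either delete that block or point
# it at your own scratch disks, e.g. (hypothetical paths):
#
#   utils/create_split_dir.pl \
#     /mnt/scratch{1,2,3}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage \
#     $dir/egs/storage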
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh new file mode 100755 index 00000000000..92a1a7da277 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option- +#currently in a branch] +# Overall no real change. + +# ./compare_wer.sh 4f 4s +# System 4f 4s +# WER on train_dev(tg) 16.83 16.82 +# WER on train_dev(fg) 15.73 15.62 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.111371 +# Final valid prob -0.123021 -0.12648 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
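The "Final train prob" and "Final valid prob" rows in tables like the one above are read off the nnet3 training diagnostics. An editorial sketch, under the assumption that this script version writes per-iteration logs named compute_prob_train.<iter>.log and compute_prob_valid.<iter>.log in $dir/log (check the log directory if yours differ):

    dir=exp/chain/tdnn_4s_sp      # hypothetical: point this at the model you trained
    for kind in train valid; do
      log=$(ls -v $dir/log/compute_prob_${kind}.*.log | tail -n 1)   # last iteration's log
      grep -H 'Overall log-probability' $log
    done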
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.02 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh new file mode 100755 index 00000000000..30b383d05d7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# [note, I accidentally overwrote this directory afterwards, and moved it.] +# It's really not clear whether it's helpful. +# ./compare_wer.sh 4f 4t +# System 4f 4t +# WER on train_dev(tg) 16.83 16.75 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.112721 +# Final valid prob -0.123021 -0.129688 + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
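# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): the _x, _z and _2a runs
# described above differ only in the --num-extra-states value passed to the
# phone-LM estimation through --lm-opts.  The loop below merely assembles the
# option strings; the ${dir}_lm$n output naming is hypothetical, and note that
# the newer training calls in this file spell the option --num-extra-lm-states.
for n in 0 2000 8000; do
  lm_opts="--num-extra-states=$n"
  echo "would train with: --lm-opts \"$lm_opts\"  (e.g. output dir ${dir}_lm$n)"
done
# ---------------------------------------------------------------------------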
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.04 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh new file mode 100755 index 00000000000..ae7cf02b426 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh @@ -0,0 +1,384 @@ +#!/bin/bash + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +# It seems a bit better on average. +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs.
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
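# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): how much total acoustic
# context a --splice-indexes string implies.  Per-layer splice offsets add up
# through the network, so summing the most negative and most positive offset
# of each layer gives the model's left/right input context (compare the
# --extra-left-context 20 passed to the decode stage further down).
splice="-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"   # as used in this script
left=0; right=0
for layer in $splice; do
  min=$(echo "${layer//,/ }" | tr ' ' '\n' | sort -n | head -n1)
  max=$(echo "${layer//,/ }" | tr ' ' '\n' | sort -n | tail -n1)
  if [ "$min" -lt 0 ]; then left=$(( left - min )); fi
  if [ "$max" -gt 0 ]; then right=$(( right + max )); fi
done
echo "total left context: $left frames, right context: $right frames"
# ---------------------------------------------------------------------------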
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh new file mode 100755 index 00000000000..9cdbfefb5a2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
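# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): the train_dev %WER lines
# quoted above (e.g. .../decode_train_dev_sw1_tg/wer_11_0.0) are scoring
# outputs for different LM weights and insertion penalties inside each decode
# directory; picking the best one by hand amounts to something like the loop
# below.  The eval2000 numbers come from sclite *.sys files instead, and the
# directory names here are examples only.
for d in exp/chain/tdnn_2y_sp/decode_train_dev_sw1_{tg,fsh_fg}; do
  [ -d "$d" ] || continue          # skip decodes that have not been run
  grep WER "$d"/wer_* | sort -n -k2 | head -n1
done
# ---------------------------------------------------------------------------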
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
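# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original patch): the egs edge handling
# introduced in _p above, i.e. per-frame derivative weights that are zero for
# the outer 10 frames of a chunk and then ramp linearly up to 1.0 over the
# next 10 frames.  The real get_egs/training code may use a different ramp
# shape; this only illustrates the idea for one edge of a chunk.
for t in $(seq 0 24); do
  if   [ "$t" -lt 10 ]; then w=0.0
  elif [ "$t" -lt 20 ]; then w=$(awk -v t="$t" 'BEGIN { printf "%.1f", (t - 9) / 10 }')
  else                       w=1.0
  fi
  echo "frame $t: derivative weight $w"
done
# ---------------------------------------------------------------------------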
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh new file mode 100755 index 00000000000..6dd5c587f7a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -0,0 +1,397 @@ +#!/bin/bash + +# _4w is as _4v, but doubling --xent-regularize to 0.2 WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
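+# [Illustrative aside, not from the original experiments: the _3r note above
+# argues that when the minibatch is halved the max-change should be scaled by
+# 1/sqrt(2) rather than 1/2, on the assumption that per-minibatch gradient noise
+# grows like 1/sqrt(minibatch-size).  The implied factor, as a quick check:
+#   awk -v m=1.0 -v r=0.5 'BEGIN{printf "halved: %.3f   sqrt-scaled: %.3f\n", m*r, m*sqrt(r)}'
+#   # halved: 0.500   sqrt-scaled: 0.707
+# i.e. a max-change of 1.0 becomes about 0.707 rather than 0.5.]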
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh new file mode 100755 index 00000000000..0290e0bdbd5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. 
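+# [Illustrative aside, not from the original experiments: the per-set deltas
+# quoted near the top of this file for 4u -> 4x, "(+0.2, +0.2, 0.0, -0.1)", can
+# be reproduced from the 4u and 4x columns of the table above:
+#   paste <(printf '16.47\n15.23\n18.4\n16.7\n') <(printf '16.63\n15.42\n18.4\n16.6\n') \
+#     | awk '{printf "%+.2f\n", $2 - $1}'
+#   # +0.16 +0.19 +0.00 -0.10, i.e. (+0.2, +0.2, 0.0, -0.1) at one decimal.]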
+ +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
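+# [Illustrative aside, not from the original experiments: the _2y note above
+# raises --frames-per-iter from 800k to 1.2M to amortize per-iteration overhead.
+# Assuming the number of training iterations scales inversely with
+# frames-per-iter (job-count details aside), the saving is roughly a third:
+#   awk 'BEGIN{printf "iterations scaled by %.2f (about %.0f%% fewer)\n", 800000/1200000, 100*(1 - 800000/1200000)}'
+#   # iterations scaled by 0.67 (about 33% fewer)]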
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
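+# [Illustrative aside, not from the original experiments: the _p note above
+# describes per-frame derivative weights dumped with the egs -- with the 30-frame
+# overlap, 10 frames of zero weight at each edge and then a ramp to 1.0 over the
+# next 10 frames.  A sketch of one edge, assuming a simple linear ramp:
+#   seq 0 24 | awk '{t=$1; w = t<10 ? 0 : (t<20 ? (t-10+1)/10 : 1); printf "%.1f ", w} END{print ""}'
+#   # ten 0.0's, then 0.1 0.2 ... 1.0, then 1.0 for the interior frames.]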
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
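+  # Illustrative sketch (not part of the original recipe): per the _i comment
+  # above, moving to 150-frame egs with minibatch 128 (from 75 frames with
+  # minibatch 256) keeps the number of frames per minibatch unchanged,
+  #   echo "$((150 * 128)) vs $((75 * 256))"   # -> 19200 vs 19200
+  # so memory use per minibatch stays roughly the same.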
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
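+# [Illustrative aside, not from the original experiments: the _3e note above
+# doubles --num-jesus-blocks to reduce computation.  Assuming the big matrix in
+# the jesus layer is block-diagonal over those blocks, its parameter count is
+# roughly jesus-hidden-dim * jesus-output-dim / num-blocks, so for the 3d dims
+# (hidden 15000, output 1800):
+#   awk 'BEGIN{for (b=100; b<=200; b*=2) printf "blocks=%d: %d params in that matrix\n", b, 15000*1800/b}'
+#   # blocks=100: 270000   blocks=200: 135000, i.e. halved.]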
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
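+  # Illustrative sketch (not part of the original recipe): the _4v comment at the
+  # top of this file raises --max-param-change from 1.0 to 2.0 because the xent
+  # head contributes a large share of the parameter change.  Assuming the global
+  # cap rescales the whole update when its 2-norm exceeds the limit, i.e.
+  #   scale = min(1, max-param-change / ||update||),
+  # a big xent-layer update also shrinks every other layer's step; for example:
+  #   awk -v cap=1.0 -v xent=1.2 -v rest=0.8 \
+  #     'BEGIN{n=sqrt(xent^2 + rest^2); s=(n>cap)?cap/n:1; printf "norm=%.2f scale=%.2f\n", n, s}'
+  #   # norm=1.44 scale=0.69  (with cap=2.0 the scale would be 1.00)
+  # The xent/rest norms here are made-up numbers purely for illustration.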
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
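+# (For reference, the %WER lines quoted in these headers are the best-scoring
+# entries from the decode directories; a minimal way to pull them out, assuming
+# the standard utils/best_wer.sh helper of this setup, is e.g.:
+#   grep WER exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh
+#   grep Sum exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_*/*.ctm.filt.sys | utils/best_wer.sh )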
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
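+  # A few informal notes on the main options in the call below (the
+  # authoritative documentation is steps/nnet3/chain/train_tdnn.sh itself):
+  #  - --xent-regularize adds a separate cross-entropy output whose objective
+  #    is added to the chain objective, scaled by this weight.
+  #  - --leaky-hmm-coefficient 0.1 smooths the denominator computation by,
+  #    roughly speaking, allowing a small probability of leaking into any HMM
+  #    state on each frame.
+  #  - --egs-dir exp/chain/tdnn_2y_sp/egs re-uses the examples dumped for the
+  #    2y run; the splice-indexes were chosen narrow enough (see the _4r note
+  #    above) that no extra frame context is needed.
+  #  - with --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" the
+  #    total acoustic context of the network is 1+1+3+3+3+6 = 17 frames on the
+  #    left and 1+2+3+3+3+0 = 12 frames on the right.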
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
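+  # Note: as described at the top of this file, 5c branches from 4w rather than
+  # from 5b, so compared with the 5b call this one uses --xent-regularize 0.05,
+  # has no --leaky-hmm-coefficient, and keeps the smaller jesus dims
+  # (400/1400 rather than 500/1800).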
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
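+# (As a rough guide to what --frames-per-iter controls: the number of training
+# iterations is approximately
+#   num_epochs * total_training_frames / (frames_per_iter * average_num_jobs),
+# so the 800k -> 1.2M change described in the 2y note above reduced the number
+# of iterations, and with it the total model-averaging overhead, by about a
+# third.)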
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
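+  # Relative to the 5b call, the only option changes below are the larger jesus
+  # dims (600/2000 instead of 500/1800).  One way to check what this costs in
+  # model size, once both models exist, is something like:
+  #   nnet3-am-info exp/chain/tdnn_5b_sp/final.mdl | grep num-parameters
+  #   nnet3-am-info exp/chain/tdnn_5d_sp/final.mdl | grep num-parameters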
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
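+# (The tables above, including the "Final train prob" / "Final valid prob" rows,
+# are in the format produced by the compare_wer.sh script used throughout these
+# headers; once this run and 5b have both finished, "./compare_wer.sh 5b 5e"
+# should reproduce the comparison at the top of this file.)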
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
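# To make the edge handling described under _p above concrete, this is roughly
# the per-frame deriv-weight shape I have in mind for one 150-frame eg: 10
# zero-weight frames and then a 10-frame linear ramp up to 1.0 at each edge.
# Illustration only -- the actual weights are whatever gets dumped alongside
# the egs:
#
# awk -v n=150 'BEGIN {
#   for (t = 0; t < n; t++) {
#     d = (t < n - 1 - t) ? t : n - 1 - t;              # distance to nearer edge
#     w = (d < 10) ? 0 : (d < 20 ? (d - 9) / 10.0 : 1);
#     printf("%d %.1f\n", t, w);
#   }
# }'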
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.05 is better than 0.2 or 0.1). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
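# The way I tally the temporal context implied by this setup's splice indexes
# ("-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0", from the train command
# below): sum the most-negative offsets for the left context and the
# most-positive for the right, giving 1+1+3+3+3+6 = 17 frames on the left and
# 1+2+3+3+3+0 = 12 on the right (for what it's worth, the decode below passes
# --extra-left-context 20, which is more than that).  A throwaway check of the
# arithmetic:
#
# echo "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" | awk '{
#   left = right = 0;
#   for (i = 1; i <= NF; i++) {
#     n = split($i, a, ",");
#     for (j = 1; j <= n; j++) { v = a[j] + 0;
#       if (j == 1 || v < min) min = v;
#       if (j == 1 || v > max) max = v; }
#     left += -min; right += max;
#   }
#   printf("left context = %d frames, right context = %d frames\n", left, right);
# }'   # -> left context = 17 frames, right context = 12 frames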
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
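# ("Helpful on average" in these notes is just the unweighted mean of the four
# WER deltas -- train_dev tg/fg and eval2000 tg/fg; e.g. for the 5a->5b deltas
# quoted further up in this file:)
#
# echo "-0.35 -0.35 -0.1 0" | awk '{ for (i = 1; i <= NF; i++) s += $i; printf("mean WER change = %.2f\n", s / NF) }'
# # -> mean WER change = -0.20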
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5g.sh b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh new file mode 100755 index 00000000000..784facf5a82 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + + +# Here is decoding with --frames-per-chunk 300. A fairly consistent +# improvement. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.27 +#WER on train_dev(fg) 14.32 14.21 +#WER on eval2000(tg) 17.3 16.9 +#WER on eval2000(fg) 15.5 15.2 +#Final train prob -0.110056 -0.103752 +#Final valid prob -0.129184 -0.125641 + + +# *All results below here are broken-- they were computed when I had a bug in +# the index-permutation, and the blocks weren't computed right for the jesus +# layer.* +# Here are WERs when the frames-per-chunk was 50: +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.62 +#WER on train_dev(fg) 14.32 14.42 +#WER on eval2000(tg) 17.3 17.7 +#WER on eval2000(fg) 15.5 16.0 + +# and here with 150: +# WER on train_dev(tg) 15.43 15.46 +# WER on train_dev(fg) 14.32 14.38 +# WER on eval2000(tg) 17.3 17.3 +# WER on eval2000(fg) 15.5 15.5 + + +# and here with 300 ... we do see a small improvement +# at this value. 
(could probably improve it further +# by modifying the model to average over a larger window). +#WER on train_dev(tg) 15.43 15.29 +#WER on train_dev(fg) 14.32 14.17 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.4 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + +# Below is also with chunk-size=300, but with the 'wide' model +# that sees more context. Oddly, the WER is worse. It looks like +# the model may be doing something different than just learning +# speaker characteristics. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.54 +#WER on train_dev(fg) 14.32 14.34 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
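# Circling back to what 5g actually adds: the statistics-extraction layer shows
# up as one extra term in the --splice-indexes string of the train command
# below (the 4th layer's entry grows a "mean+stddev(...)" part):
#
#   --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0"
#
# My reading of mean+stddev(-99:3:9:99) -- worth double-checking against the
# config-generation script -- is (left-context : input-period : stats-period :
# right-context): pool a mean and a standard deviation over roughly t-99..t+99,
# sampling the layer's input every 3 frames and re-emitting the pooled stats
# every 9 frames.  That is presumably why decoding with a longer
# --frames-per-chunk (300 vs 50) helps here: short chunks truncate the window
# the stats are pooled over.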
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# if [ $stage -le 15 ]; then +# # get wide-context model +# nnet3-am-copy --binary=false $dir/final.mdl - | \ +# sed 's/Context> 99/Context> 306/g' | nnet3-am-copy - $dir/wide.mdl +# for decode_set in train_dev eval2000; do +# ( +# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --frames-per-chunk 300 --iter wide \ +# --nj 50 --cmd "$decode_cmd" \ +# --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ +# $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; +# fi +# ) & +# done +# fi + + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5h.sh b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh new file mode 100755 index 00000000000..5eeb5ca5d03 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +# _5h is as _5g, but only mean, no stddev, stats. + +# The following comparison is with 150 frames per chunk +# in both the 5g and 5h decodes. No consistent WER difference +# with either 5e or 5g. 
+#System 5e 5g 5h +#WER on train_dev(tg) 15.43 15.46 15.45 +#WER on train_dev(fg) 14.32 14.38 14.34 +#WER on eval2000(tg) 17.3 17.3 17.2 +#WER on eval2000(fg) 15.5 15.5 15.7 +#Final train prob -0.110056 -0.105725 -0.106213 +#Final valid prob -0.129184 -0.125756 -0.126809 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
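# A quick worked example for the absolute-vs-relative figures quoted in this history, using
# the _s numbers above (arithmetic only, not part of the recipe): 19.8 -> 18.0 on eval2000 is
# 1.8 absolute, or about 9% relative:
awk 'BEGIN { a=19.8; b=18.0; printf("abs=%.1f  rel=%.1f%%\n", a-b, 100*(a-b)/a) }'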
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5i.sh b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh new file mode 100755 index 00000000000..9ffc37793ee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh @@ -0,0 +1,432 @@ +#!/bin/bash + +# _5i is as _5g, but adding the mean+stddev features for all hidden layers. +# a little worse than 5g (but for Remi Francis it was a little better). +#local/chain/compare_wer.sh 5e 5g 5i +#System 5e 5g 5i +#WER on train_dev(tg) 15.43 15.27 15.41 +#WER on train_dev(fg) 14.32 14.21 14.47 +#WER on eval2000(tg) 17.3 16.9 17.0 +#WER on eval2000(fg) 15.5 15.2 15.4 +#Final train prob -0.110056 -0.103752 -0.102539 +#Final valid prob -0.129184 -0.125641 -0.12375 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
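# On the directory names in the results above: the wer_11_0.0 / score_10_0.0 suffixes follow
# the usual Kaldi scoring convention of <LM-weight>_<word-insertion-penalty>.  show_wer.sh
# itself is a local helper not shown here; a rough stand-in using standard utilities (the path
# is illustrative) would be:
grep WER exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh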
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
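# The _p note above describes the scheme for the edges of the egs: ten frames with zero
# derivative weight on each side, then a linear ramp up to 1.0 over the next ten frames.  A
# toy print-out of that left-edge ramp (illustration only, not the code that writes the egs):
awk 'BEGIN { for (t=1; t<=25; t++) { w = (t<=10) ? 0 : ((t<=20) ? (t-10)/10.0 : 1.0); printf("frame %2d  weight %.1f\n", t, w) } }'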
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
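  # About the splice-indexes passed to train_tdnn.sh just below: each mean+stddev(...) entry
  # asks for a statistics-extraction layer that appends, per feature dimension, the mean and
  # standard deviation pooled over a wide window of frames (the -99:1:9:99-style specifier
  # configures that window; its exact fields are not spelled out here).  A toy illustration of
  # the pooled statistics for a one-dimensional "feature", illustration only:
  # echo "1 2 3 4 5" | awk '{for(i=1;i<=NF;i++){s+=$i;ss+=$i*$i} m=s/NF; printf("mean=%.2f stddev=%.2f\n", m, sqrt(ss/NF-m*m))}'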
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2,mean+stddev(-99:1:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -6,-3,0,mean+stddev(-99:3:9:99)" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5j.sh b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh new file mode 100755 index 00000000000..892a79fd2a8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
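# A rough summary of the two regularizers that recur in this history, as we understand the
# chain training options (see the training code for the exact definitions): --xent-regularize
# adds a separate cross-entropy output whose objective is weighted by the given factor and
# used only as a regularizer, while --l2-regularize penalizes the squared magnitude of the
# chain output, so halving it (0.0001 -> 0.00005, the 4e -> 4f change) halves that penalty.
# Toy arithmetic only, ignoring any constant factor:
awk 'BEGIN { y2 = 1.0*1.0 + 2.0*2.0 + 0.5*0.5; printf("l2 term: %g at 1e-4, %g at 5e-5\n", 1e-4*y2, 5e-5*y2) }'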
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
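# A small bookkeeping note tying the egs discussion above to the options below: the chain
# setup here uses a frame-subsampling factor of 3 (see the build_tree call further down), so
# the network output is evaluated once per three input frames; a 150-frame eg
# (frames_per_eg=150) therefore carries roughly 150 / 3 = 50 supervision frames.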
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5k.sh b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh new file mode 100755 index 00000000000..b6c984ed253 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh @@ -0,0 +1,454 @@ +#!/bin/bash + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
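# As a sanity check on how the bracketed %WER lines above are put together: the error count is
# insertions + deletions + substitutions over the number of reference words.  For the 4f
# train_dev line, 870 + 2354 + 5058 = 8282 and 100 * 8282 / 49204 = 16.83.  The same check as
# a one-liner (arithmetic only):
awk 'BEGIN { printf("%%WER %.2f\n", 100*(870+2354+5058)/49204) }'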
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
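# For reading the pipe-delimited eval2000 lines above: these are sclite summaries, where the
# two counts are sentences and reference words (4459 and 42989) and the percentages are Corr,
# Sub, Del, Ins, overall Err and the sentence-error rate.  As a check on the 3g line:
# 11.1 + 5.3 + 2.2 = 18.6, matching the reported 18.7 up to rounding, and
# 83.5 + 11.1 + 5.3 = 99.9, i.e. Corr + Sub + Del accounts for the reference words.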
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively treats the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
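To make the _p "deriv-weights" description above concrete, here is roughly the per-frame weight pattern it describes for one 150-frame eg: zeros on the 10 edge frames at each end, a 10-frame linear ramp, and 1.0 in the middle. This is only an illustration of the idea; the exact values are whatever the egs-dumping code writes, and in this particular script they are not even applied, since --apply-deriv-weights is set to false.

awk 'BEGIN {
  n = 150;                                              # frames per eg
  for (t = 0; t < n; t++) {
    if (t < 10 || t >= n - 10)  w = 0.0;                 # zero-derivative edge frames
    else if (t < 20)            w = (t - 9) / 10.0;      # ramp up over 10 frames
    else if (t >= n - 20)       w = (n - 10 - t) / 10.0; # mirror-image ramp down at the end
    else                        w = 1.0;
    printf("%3d %.1f\n", t, w);
  }
}'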
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
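+
+  # Two notes on the train_tdnn.sh call below.  First, the egs are re-used from the 5j run via
+  # --egs-dir rather than re-dumped.  Second, the mean+stddev(-99:3:9:99) entry in --splice-indexes
+  # is the statistics-extraction layer this experiment adds; as I understand the spec, the four
+  # fields are left-context:input-period:stats-period:right-context, i.e. mean and stddev stats
+  # pooled over about +-99 frames, reading the input every 3 frames and emitting stats every 9
+  # frames (if I have the field order wrong, the config-generation script is the reference).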
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5j_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5l.sh b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh new file mode 100755 index 00000000000..d5b51eb7551 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh @@ -0,0 +1,464 @@ +#!/bin/bash + +# _5l is as _5k, but doubling frames-per-eg from 150 to 300, and increasing +# the context radius of the statistics-pooling from 99 to 153. + +# :-( No better than 5k.) +#./compare_wer.sh 5e 5j 5k 5l +#System 5e 5j 5k 5l +#WER on train_dev(tg) 15.43 17.59 16.46 16.68 +#WER on train_dev(fg) 14.32 16.33 15.17 15.40 +#WER on eval2000(tg) 17.3 19.1 18.1 18.3 +#WER on eval2000(fg) 15.5 17.5 16.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502-0.0804455 +#Final valid prob -0.129184 -0.130761 -0.12337 -0.10712 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. 
+#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
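The max-param-change value set further down in this script (1.414) follows the same convention as the _3r note above: when the minibatch size is halved (this run uses --minibatch-size 64 instead of 128), the max-change is scaled by 1/sqrt(2) rather than 1/2. Just to make the arithmetic explicit, since the script hard-codes the rounded result:

awk 'BEGIN { printf("2 / sqrt(2) = %.3f\n", 2 / sqrt(2)) }'   # -> 1.414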
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.414 # was 2; now 2 / sqrt(2) = sqrt(2), since we're using half the minibatch size. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --frames-per-eg 300 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-153:3:9:153) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size 64 \ + --egs-opts "--frames-overlap-per-eg 0" \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5m.sh b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh new file mode 100644 index 00000000000..a9e12357c23 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +# _5m is as _5e, but with a script change where we are randomizing +# the frame shift a bit better. + +# No very clear change, but if anything the optimization is less effective +# and the WER worse -> I'm going to revert this script change. +#System 5e 5m +#WER on train_dev(tg) 15.43 15.57 +#WER on train_dev(fg) 14.32 14.47 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.7 +#Final train prob -0.110056 -0.112539 +#Final valid prob -0.129184 -0.129006 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
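Coming back to what 5m itself changes ("randomizing the frame shift a bit better", see the header at the top of this file): with a frame-subsampling factor of 3, each archive of egs can be trained on at a frame shift of 0, 1 or 2. The loop below is only a toy picture of the difference between a fixed archive-to-shift mapping and a randomized one; the real choice happens inside the training script and the exact scheme may differ.

for archive in 1 2 3 4 5 6; do
  old_shift=$(( archive % 3 ))              # deterministic: the same shift for an archive every time
  new_shift=$(( (archive + RANDOM) % 3 ))   # "randomized a bit better": varies from run to run
  echo "archive $archive: old shift $old_shift, new shift $new_shift"
done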
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively treats the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
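One more aside, on the --splice-indexes used in the train command below: for a plain TDNN like this one (no mean+stddev entry, no recurrence), the total left/right context the model needs is just the sum of the most-negative and most-positive offsets across the layers, which is why the decode stage passes --extra-left-context 20. A quick way to check the numbers:

echo "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" | awk '{
  left = 0; right = 0;
  for (i = 1; i <= NF; i++) {
    n = split($i, a, ",");
    min = a[1] + 0; max = a[1] + 0;
    for (j = 2; j <= n; j++) {
      if (a[j] + 0 < min) min = a[j] + 0;
      if (a[j] + 0 > max) max = a[j] + 0;
    }
    left += min; right += max;
  }
  printf("total left context: %d, total right context: %d\n", -left, right);  # 17 and 12 here
}'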
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5n.sh b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh new file mode 100755 index 00000000000..d4372a418d8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh @@ -0,0 +1,459 @@ +#!/bin/bash + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
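To make the _p note above concrete: with a 30-frame overlap, each chunk gets per-frame derivative weights that are zero for the outermost 10 frames, ramp linearly up to 1.0 over the next 10, and stay at 1.0 in the interior. A rough stand-alone illustration of that pattern (the chunk size and ramp lengths are assumptions taken from the notes, not read from any egs):

  frames_per_eg=150; zero=10; ramp=10
  for ((t=0; t<frames_per_eg; t++)); do
    d=$(( t < frames_per_eg-1-t ? t : frames_per_eg-1-t ))   # distance to the nearer chunk edge
    if   [ $d -lt $zero ];            then w=0.00
    elif [ $d -lt $((zero+ramp)) ];   then w=$(awk -v d=$d -v z=$zero -v r=$ramp 'BEGIN{printf "%.2f",(d-z+1)/r}')
    else                                   w=1.00
    fi
    printf '%s ' "$w"
  done; echo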
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5o.sh b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh new file mode 100755 index 00000000000..86bbe1ad441 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh @@ -0,0 +1,467 @@ +#!/bin/bash + +# _5o is as _5n but adding an extra splicing layer and increasing the +# splice-width slightly on the 1st layer, to get closer to the context in 5n; +# having one more layer running at double-frequency, and reverting the frame-length to +# the same as in the baseline (25ms) to avoid sacrificing frequency resolution. + +# Objective functions improve but WER change is quite small vs 5n (~0.1%). so +# not clear that the extra time is worth it (it's noticeably slower to train as +# that extra layer is at a higher sampling rate). +# +#System 5j 5n 5o +#WER on train_dev(tg) 17.59 16.85 16.83 +#WER on train_dev(fg) 16.33 15.67 15.60 +#WER on eval2000(tg) 19.1 19.1 18.8 +#WER on eval2000(fg) 17.5 17.3 17.2 +#Final train prob -0.114691 -0.116341 -0.111613 +#Final valid prob -0.130761 -0.130884 -0.126765 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
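The tables above come from the local compare_wer.sh / show_wer.sh helpers; for a quick manual look at the train_dev decodes (whose per-LM-weight WERs live in wer_* files, as in the paths printed above), something like the following works (a hand-rolled sketch, not the actual comparison script):

  for sys in 4f 4r; do                      # the two systems compared in the table above
    for d in exp/chain/tdnn_${sys}_sp/decode_train_dev_sw1_{tg,fsh_fg}; do
      [ -d $d ] || continue
      printf '%-55s ' "$d"
      grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh   # picks the best LM-weight/penalty
    done
  done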
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
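The point of the _2y change above is that --frames-per-iter sets how many frames each parallel job processes per iteration, so raising it reduces the number of iterations and with it the per-iteration overhead (model averaging, job startup). A back-of-the-envelope sketch with assumed numbers (the total frame count below is invented purely for illustration):

  num_frames=300000000      # assumed total training frames after speed perturbation -- not measured
  num_epochs=4
  frames_per_iter=1200000
  num_jobs_initial=3; num_jobs_final=16
  avg_jobs=$(( (num_jobs_initial + num_jobs_final) / 2 ))
  echo "roughly $(( num_frames * num_epochs / (frames_per_iter * avg_jobs) )) training iterations"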
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \ + data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl2 $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl2 $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl2 \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5p.sh b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh new file mode 100755 index 00000000000..d2ef7057873 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh @@ -0,0 +1,421 @@ +#!/bin/bash + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
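One small numerical aside on the _3r note earlier in this history (the one about applying a reduced max-change on mix-up iterations): scaling by 1/sqrt(2) instead of 1/2, to track the halved minibatch size, is the difference between about 0.707 and 0.5:

  awk 'BEGIN { printf "1/2 = %.3f   1/sqrt(2) = %.3f\n", 0.5, 1/sqrt(2) }'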
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5q.sh b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh new file mode 100755 index 00000000000..5968a00417e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh @@ -0,0 +1,425 @@ +#!/bin/bash + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
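The edge weighting described for _p above (historical here: the runs in this file set --apply-deriv-weights false and --frames-overlap-per-eg 0) can be pictured with a small sketch: per the comment, each eg gets 10 frames on each side whose output derivatives are weighted 0, followed by 10 frames ramping up to 1.0. The linear ramp below is only an illustration consistent with that description, not the exact formula the egs-dumping code uses.

# plausible per-frame derivative weight near the left edge of an eg (illustration only)
awk 'BEGIN {
  for (t = 0; t < 25; t++) {
    if (t < 10)      w = 0.0;               # first 10 frames: zero derivative weight
    else if (t < 20) w = (t - 9) / 10.0;    # next 10 frames: ramp up to 1.0
    else             w = 1.0;               # interior frames: full weight
    printf("frame %2d  weight %.1f\n", t, w);
  }
}'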
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
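The JHU-grid block just above relies on ordinary shell brace expansion to name one target directory per file system, and utils/create_split_dir.pl then (roughly) sets up $dir/egs/storage so that the bulky egs are spread across those disks rather than filling one of them. Only the expansion itself is shown here; $USER and the date are filled in by the shell at run time, and none of this runs unless the hostname matches *.clsp.jhu.edu.

dir=exp/chain/tdnn_5q_sp   # the value used in this run ("_sp" appended since speed_perturb=true)
echo /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage
# expands to four paths, one each under /export/b05, /export/b06, /export/b07 and /export/b08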
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5r.sh b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh new file mode 100755 index 00000000000..306d76859f9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. +# [abandoned after discovering bug, this thread is picked up in 5s and 5t.] + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
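Those four numbers are just the per-test-set 5a -> 5b changes from the comparison table that follows; spelled out as a throwaway check (WER figures copied from that table):

awk 'BEGIN {
  split("15.86 14.74 17.4 15.6", a);   # 5a: train_dev(tg), train_dev(fg), eval2000(tg), eval2000(fg)
  split("15.51 14.39 17.3 15.6", b);   # 5b, same order
  for (i = 1; i <= 4; i++) printf("%+.2f ", b[i] - a[i]); print ""
}'
# -> -0.35 -0.35 -0.10 +0.00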
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
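The file names quoted in these results (wer_10_0.0, score_9_0.0, ...) encode the language-model weight and insertion penalty used at scoring time; each decode directory holds one such file per setting, and the numbers shown are the best of them. For the train_dev-style decodes, one way to pull out the best line by hand is sketched below (an illustration, not necessarily what show_wer.sh or compare_wer.sh do internally; the eval2000 numbers come from the sclite .sys files under score_*/ instead).

decode_dir=exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg   # one of the directories quoted above
# each wer_<lmwt>_<penalty> file contains a "%WER ..." summary line; keep the lowest
grep -H '%WER' $decode_dir/wer_* | sort -n -k2,2 | head -n1
# should print the .../wer_10_0.0 line with WER 16.83, matching the 4f result above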
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
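The bracketed fields in these show_wer.sh lines read "total errors / reference words, insertions, deletions, substitutions", and the headline %WER is simply their ratio. A throwaway check on the 3d train_dev (tg) line above, just to make the bookkeeping explicit (not part of the recipe):

# WER = 100 * (ins + del + sub) / reference words, using the tdnn_3d numbers quoted above
awk 'BEGIN { ins=1023; del=2155; subs=5361; ref=49204;
             err = ins + del + subs;
             printf("errors=%d  WER=%.2f%%\n", err, 100*err/ref) }'
# prints: errors=8539  WER=17.35%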
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
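Because the script sources ./utils/parse_options.sh after its configuration block, any variable set above it (stage, train_stage, num_epochs, ...) can be overridden from the command line, and stage=12 as shipped assumes the earlier data/lang/tree stages were completed by a previous run. Usage sketch (paths as in this file):

# rerun only graph building and decoding for the 5r setup, reusing the trained model
local/chain/run_tdnn_5r.sh --stage 13

# resume a partly finished run by handing train_tdnn.sh a later --stage
local/chain/run_tdnn_5r.sh --stage 12 --train-stage 150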
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5s.sh b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh new file mode 100755 index 00000000000..65da1e06183 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# Comparing with 5e which is the most recent baseline we actually decoded, +# 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and +# and the new option --self-repair-scale 0.00001 added. +# Also compare 5t and 5v which have even smaller j3sus-hidden-dims. + +#./compare_wer.sh 5e 5s 5t +#System 5e 5s 5t +#WER on train_dev(tg) 15.43 15.47 15.43 +#WER on train_dev(fg) 14.32 14.31 14.34 +#WER on eval2000(tg) 17.3 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 +#Final train prob -0.110056 -0.110928 -0.110752 +#Final valid prob -0.129184 -0.132139 -0.129123 + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). 
+ +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5t.sh b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh new file mode 100755 index 00000000000..9831417003b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. Seems to make no difference to WERs; valid prob improves. + +#local/chain/compare_wer.sh 5e 5s 5t +#System 5e 5s 5t +#WER on train_dev(tg) 15.43 15.47 15.43 +#WER on train_dev(fg) 14.32 14.31 14.34 +#WER on eval2000(tg) 17.3 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 +#Final train prob -0.110056 -0.110928 -0.110752 +#Final valid prob -0.129184 -0.132139 -0.129123 + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. 
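The decode stage of these scripts (as in the 5s run just above) loops over train_dev and eval2000 and runs each decode, plus the optional fsh_fg rescoring, inside a backgrounded subshell; the trailing wait holds the script until both finish, and the "|| exit 1" inside the parentheses only aborts that subshell, so a failed decode does not stop the top-level script. The same pattern reduced to a skeleton (illustration only):

for decode_set in train_dev eval2000; do
  (
    echo "decoding $decode_set ..."    # stand-in for steps/nnet3/decode.sh + rescoring
    sleep 1
  ) &
done
wait   # block until both background subshells have finished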
+ +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. 
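Several of the entries above mention re-using previously dumped egs (all the runs in this file pass --egs-dir exp/chain/tdnn_2y_sp/egs, and _4r reduced layer context specifically so the old egs still fit). A dumped egs directory records the context it was built with, so a quick compatibility check before re-using it might look like the sketch below (assumption: the usual info/ files written by the nnet3 chain egs-dumping script are present).

egs_dir=exp/chain/tdnn_2y_sp/egs   # the egs directory re-used by the runs in this file
for f in left_context right_context frames_per_eg; do
  [ -f $egs_dir/info/$f ] && echo "$f: $(cat $egs_dir/info/$f)"
done
# a model that needs more left/right context than the egs were dumped with cannot re-use them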
+ +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
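+
+# (Aside on the egs sizes mentioned in the _i note below: going from 75-frame
+# egs with 256 elements per minibatch to 150-frame egs with 128 elements keeps
+# the number of frames per minibatch unchanged, since 75 * 256 = 150 * 128 = 19200.)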
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5t # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
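+
+  # (For reference: with the --splice-indexes passed below, the network's total
+  # left context works out to 1+1+3+3+3+6 = 17 frames and its right context to
+  # 1+2+3+3+3+0 = 12 frames, since the per-layer offsets accumulate across layers.)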
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5u.sh b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh new file mode 100755 index 00000000000..34fe30993cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh @@ -0,0 +1,505 @@ +#!/bin/bash + +# _5u is as _5o but modifying the mfcc generation to use a narrower window while +# generating the lower-order mfcc coefficients (the first 10). + +# Abandoning it partway through after I got the following less-than-promising diagnostics. +# grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_valid.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146977 + -0.0159528 = -0.16293 per frame, over 20000 frames. +# exp/chain/tdnn_5u_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.147207 + -0.015692 = -0.162899 per frame, over 20000 frames. +# a03:s5c: grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_train.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146703 + -0.0165036 = -0.163207 per frame, over 20000 frames. 
+# exp/chain/tdnn_5u_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.145524 + -0.0162272 = -0.161751 per frame, over 20000 frames.
+
+# _5o is as _5n but adding an extra splicing layer and increasing the
+# splice-width slightly on the 1st layer, to get closer to the context in 5n;
+# having one more layer running at double-frequency, and reverting the frame-length to
+# the same as in the baseline (25ms) to avoid sacrificing frequency resolution.
+
+# Objective functions improve but WER change is quite small vs 5n (~0.1%), so
+# not clear that the extra time is worth it (it's noticeably slower to train as
+# that extra layer is at a higher sampling rate).
+#
+#System                     5j        5n        5o
+#WER on train_dev(tg)       17.59     16.85     16.83
+#WER on train_dev(fg)       16.33     15.67     15.60
+#WER on eval2000(tg)        19.1      19.1      18.8
+#WER on eval2000(fg)        17.5      17.3      17.2
+#Final train prob        -0.114691 -0.116341 -0.111613
+#Final valid prob        -0.130761 -0.130884 -0.126765
+
+# _5n is as _5j (also omitting the iVectors), but using double the input frame
+# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying
+# the splice indexes accordingly.
+
+# _5j is as _5e, but omitting the iVectors.
+
+# Definitely worse, although curiously, there is very little effect on the valid prob.
+#./compare_wer.sh 5e 5j
+#System                     5e        5j
+#WER on train_dev(tg)       15.43     17.59
+#WER on train_dev(fg)       14.32     16.33
+#WER on eval2000(tg)        17.3      19.1
+#WER on eval2000(fg)        15.5      17.5
+#Final train prob        -0.110056 -0.114691
+#Final valid prob        -0.129184 -0.130761
+
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                     5b        5e
+#WER on train_dev(tg)       15.51     15.43
+#WER on train_dev(fg)       14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob        -0.112013 -0.110056
+#Final valid prob        -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b
+#System                     5a        5b
+#WER on train_dev(tg)       15.86     15.51
+#WER on train_dev(fg)       14.74     14.39
+#WER on eval2000(tg)        17.4      17.3
+#WER on eval2000(fg)        15.6      15.6
+#Final train prob        -0.0998359 -0.112013
+#Final valid prob        -0.115884 -0.130879
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=13
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5u # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=2 # this is about the same amount of compute as the normal 4, since one
+             # epoch encompasses all frame-shifts of the data.
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=300 # doubling it, since we have half the frame rate.
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+# Generate double-frame-rate version of the data with normal window size.
+if [ $stage -le 12 ]; then
+  mfccdir=mfcc
+  for dataset in eval2000 train_dev ${train_set}; do
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \
+        data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir;
+    utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems
+  done
+fi
+
+# Generate double-frame-rate version of the data with smaller than normal window size;
+# and only keeping the first 10 MFCC coefficients.
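+# (Dimension check: the dbl3 features keep only the first 10 coefficients, and
+# stage 14 below selects dims 10-39 of the dbl2 hires features, i.e. 30 dims;
+# pasting the two gives 10 + 30 = 40-dimensional features, presumably matching
+# the usual 40-dimensional hires MFCC setup.)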
+if [ $stage -le 13 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_dbl3 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_dbl3.conf \ + data/${dataset}_dbl3 exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_dbl3 # remove segments with problems + done +fi + +# select dimension 10-39 of the dbl2 features, then create pasted features consisting +# of the 10 dimensions of the dbl3, plus the selected dimensions 10-39 of dbl2. +if [ $stage -le 14 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + steps/select_feats.sh --cmd "$train_cmd --max-jobs-run 4" 10-39 data/${dataset}_hires_dbl2 data/${dataset}_hires_dbl2_select \ + exp/make_dbl3/$dataset $mfccdir + rm data/${dataset}_hires_dbl2_select/cmvn.scp 2>/dev/null || true + steps/paste_feats.sh --cmd "$train_cmd --max-jobs-run 4" data/${dataset}_hires_dbl2_select data/${dataset}_dbl3 data/${dataset}_pasted \ + exp/make_dbl3/$dataset $mfccdir + steps/compute_cmvn_stats.sh data/${dataset}_pasted exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_pasted + done +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_pasted $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
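+  # (The --self-loop-scale 1.0 given to mkgraph.sh below corresponds to the
+  # change described in the _2c note above: training now uses transition and
+  # self-loop scales of 1, so the graph used in testing is built to match.)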
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 17 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          $graph_dir data/${decode_set}_pasted $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_pasted \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5v.sh b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh
new file mode 100755
index 00000000000..b33f013b894
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh
@@ -0,0 +1,459 @@
+#!/bin/bash
+
+# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
+
+# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
+# I ended up running it again after I suspected that we had 'got lucky' with
+# this particular run (since various experiments using 5v as a starting point
+# were failures); that rerun is the 5v2 run.
+#
+# local/chain/compare_wer.sh 5e 5s 5t 5v 5v2
+# System                     5e        5s        5t        5v        5v2
+# WER on train_dev(tg)       15.43     15.47     15.43     15.38     15.74
+# WER on train_dev(fg)       14.32     14.31     14.34     14.39     14.50
+# WER on eval2000(tg)        17.3      17.4      17.4      17.4      17.5
+# WER on eval2000(fg)        15.5      15.6      15.6      15.7      15.9
+# Final train prob        -0.110056 -0.110928 -0.110752 -0.11156  -0.112155
+# Final valid prob        -0.129184 -0.132139 -0.129123 -0.131797 -0.129516
+
+
+# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
+# up), from 5000 to 3500.
+
+# about 5s: comparing with 5e which is the most recent baseline we actually
+# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced from 7500 to 5000, and the new option
+# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
+# smaller jesus-hidden-dims.
+
+# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
+# value of 1700 (between 1500 and 1800), and also a fix for a bug in the self-repair
+# code which was doubling the thresholds so there was, in effect,
+# no upper threshold. I stopped the p,q,r runs after I found this, but in
+# configuring this run I'm bearing in mind the train and valid probs from the
+# p,q,r runs.
+
+# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
+
+# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
+# to compensate for the fact that more of the output dimensions are now being
+# usefully used.
+
+# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
+# ReLUs that are over or under-saturated.
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                     5b        5e
+#WER on train_dev(tg)       15.51     15.43
+#WER on train_dev(fg)       14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob        -0.112013 -0.110056
+#Final valid prob        -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5v # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+      /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
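+
+  # (As the _4r note above mentions, egs can only be re-used if the model's
+  # context does not exceed what the egs were dumped with; the --egs-dir option
+  # below points this run at the egs from the 2y experiment instead of dumping
+  # new ones.)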
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh new file mode 100755 index 00000000000..1a40acfa105 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. + +# Difference is tiny. +#local/chain/compare_wer.sh 5k 5w +#System 5k 5w +#WER on train_dev(tg) 16.46 16.56 +#WER on train_dev(fg) 15.17 15.30 +#WER on eval2000(tg) 18.1 18.1 +#WER on eval2000(fg) 16.5 16.4 +#Final train prob -0.105502 -0.106549 +#Final valid prob -0.12337 -0.120079 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. 
+# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
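A note to make the _2d -> _2i history above easier to follow: the phone-LM options changed name when the newer, more exact LM-estimation code landed, and only the new form appears in stage 12 of this script. Both option strings below are quoted verbatim from the notes and the script; nothing beyond what the _2d/_2i notes say is implied about their semantics.

# 2d-era options (pruned 4-gram via the older code):
#   --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
# 2i and later, as used in stage 12 below (by default a 4-gram with 3-gram as the no-prune order):
#   --lm-opts "--num-extra-lm-states=2000"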
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5w # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh new file mode 100755 index 00000000000..88dc28c2354 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5x is as _5w but decreasing the context of the averaging layer from +-0.99 +# seconds to +-0.66 seconds. I would not have expected this to work a priori, +# but the change from 5k -> 5l, which made the context wider, made WERs slightly +# worse, so I'd like to see what happens when we decrease the context. + +# It's worse. Odd because increasing the context (5k->5l) seemed to be a little +# worse also. +local/chain/compare_wer.sh 5w 5x +#System 5w 5x +#WER on train_dev(tg) 16.56 16.66 +#WER on train_dev(fg) 15.30 15.41 +#WER on eval2000(tg) 18.1 18.5 +#WER on eval2000(fg) 16.4 16.6 +#Final train prob -0.106549 -0.105693 +#Final valid prob -0.120079 -0.121834 + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. 
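To make the +-0.99 s vs +-0.66 s statement at the top of this file concrete: the model-configuration change is confined to the span of the mean+stddev() block inside --splice-indexes (5x also re-uses 5w's egs via --egs-dir in stage 12), and the outer numbers are frame offsets, so with the usual 10 ms frame shift they translate to seconds roughly as in this small sketch.

# 5w: ... -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0
# 5x: ... -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0
frames_to_seconds() { awk -v n="$1" 'BEGIN{printf("%.2f\n", n * 0.01)}'; }
frames_to_seconds 99   # -> 0.99, the +-0.99 s quoted for 5w
frames_to_seconds 63   # -> 0.63; the header above quotes this narrower span as roughly +-0.66 s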
+ +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
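Relating this back to the two 5k tables near the top of this file: the model is identical in both, so the gap between the better and worse 5k columns comes from decode-time settings alone (the worse one is explicitly the run with the default frames-per-chunk of 50 plus --extra-left-context 20), which is presumably why stage 14 below decodes with a much longer chunk. The relevant flags, quoted from that stage:

# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
#     --frames-per-chunk 300 \
#     --nj 50 --cmd "$decode_cmd" ...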
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5w_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh new file mode 100755 index 00000000000..54769c23734 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
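The _2m setting above survives in this script as a variable rather than a literal flag; the wiring, quoted from the configuration block and the stage-11 tree-building call further down, is simply:

# leftmost_questions_truncate=-1   # -1 disables the mechanism, per the _2m note
# ...
# steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
#     --leftmost-questions-truncate $leftmost_questions_truncate \
#     --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir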
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
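Picking up the _p note above: the real per-frame derivative weights are dumped with the egs by the egs-generation step, so nothing in this script computes them, but purely as an illustration of the shape the note describes (10 zero-weight frames at each edge, then a linear climb to 1.0 over the next 10 frames; the exact ramp used by the code may differ), one edge of a 150-frame eg would look something like:

for t in $(seq 0 24); do
  awk -v t="$t" 'BEGIN{ w = (t < 10) ? 0 : ((t < 20) ? (t - 9) / 10 : 1); printf("frame %2d  deriv-weight %.1f\n", t, w) }'
done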
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh new file mode 100755 index 00000000000..94843bfa2c9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _5z is as _5v, but adding skip-splicing (a new configuration option) +# It seems not helpful. I'll remove the option soon. +# note: 5v2 is a rerun of 5v. + +# local/chain/compare_wer.sh 5v 5v2 5z +# System 5v 5v2 5z +# WER on train_dev(tg) 15.38 15.74 15.60 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.6 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.113823 +# Final valid prob -0.131797 -0.129516 -0.131356 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
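+
+# [Editor's note: in stage 14 below, the rescoring call relies on bash brace
+# expansion, which can look cryptic at first sight.  For example, with
+# decode_set=train_dev:
+#
+#   echo data/lang_sw1_{tg,fsh_fg}
+#   # -> data/lang_sw1_tg data/lang_sw1_fsh_fg
+#   echo $dir/decode_train_dev_sw1_{tg,fsh_fg}
+#   # -> $dir/decode_train_dev_sw1_tg $dir/decode_train_dev_sw1_fsh_fg
+#
+# so steps/lmrescore_const_arpa.sh receives the old lang directory, the new
+# (const-arpa) lang directory, the data directory, the existing tg decode
+# directory and the new fsh_fg output directory, in that order.]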
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh new file mode 100755 index 00000000000..c618d1c0adf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -0,0 +1,490 @@ +#!/bin/bash + +# _6a is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# No better. +#local/chain/compare_wer.sh 5v 6a +#System 5v 6a +#WER on train_dev(tg) 15.38 15.49 +#WER on train_dev(fg) 14.39 14.30 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.109471 +#Final valid prob -0.131797 -0.129035 + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). 
+# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
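+
+# [Editor's sketch of the "about a million parameters" figure mentioned above
+# for the 5v -> 5y change, under the assumption (not verified against
+# make_jesus_configs.py) that the final layer is essentially an affine of size
+# final-hidden-dim x num-pdfs, with the tree built with a 9000-leaf target as
+# in stage 11 below:
+#
+#   num_pdfs=9000                           # assumption: roughly the 9000-leaf target
+#   echo $(( (500 - 400) * num_pdfs ))      # ~= 900000 params removed from the final layer
+#
+# which is roughly the million parameters being shifted into the hidden parts
+# of the network via the larger jesus-forward dims.]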
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
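+
+# [Editor's back-of-envelope for the _2y note above on --frames-per-iter, with
+# made-up round numbers -- none of these figures are measured, and the exact
+# formula lives in steps/nnet3/chain/train_tdnn.sh.  Very roughly, the number
+# of iterations scales like
+#   num_iters ~ (total_frames / frames_per_iter) * subsampling * num_epochs / avg_num_jobs
+# so raising frames-per-iter from 800k to 1.2M cuts the iteration count (and
+# with it the per-iteration model-averaging overhead) by about a third:
+#
+#   total_frames=300000000                  # assumption, order of magnitude only
+#   subsampling=3; num_epochs=4
+#   avg_jobs=$(( (3 + 16) / 2 ))            # integer average of num-jobs-initial/final
+#   echo $(( total_frames / 1200000 * subsampling * num_epochs / avg_jobs ))   # ~333
+#   echo $(( total_frames /  800000 * subsampling * num_epochs / avg_jobs ))   # ~500
+# ]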
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6b.sh b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh new file mode 100755 index 00000000000..5cd3f7dfbf2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh @@ -0,0 +1,480 @@ +#!/bin/bash + +# _6b is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6c.sh b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh new file mode 100755 index 00000000000..7334a5e185e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden +# layer inside jesus layer. + +# Note: 5v2 is a rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6c +#System 5v 5v2 6c +#WER on train_dev(tg) 15.38 15.74 15.54 +#WER on train_dev(fg) 14.39 14.50 14.55 +#WER on eval2000(tg) 17.4 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.114084 +#Final valid prob -0.131797 -0.129516 -0.129589 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --thick-jesus-layer true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6d.sh b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh new file mode 100755 index 00000000000..80b6a18cabf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + +# no clear difference. +#[note, 5v2 is a rerun of 5v]. +# local/chain/compare_wer.sh 5v 5v2 6d +# System 5v 5v2 6d +# WER on train_dev(tg) 15.38 15.74 15.66 +# WER on train_dev(fg) 14.39 14.50 14.54 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.8 +# Final train prob -0.11156 -0.112155 -0.112034 +# Final valid prob -0.131797 -0.129516 -0.131714 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 84 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6e.sh b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh new file mode 100755 index 00000000000..d44973db7ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh @@ -0,0 +1,464 @@ +#!/bin/bash + + +# _6e is as _6d but going further: reducing --num-jesus-blocks to 72 = ceil(500/7). + +# +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
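The block counts mentioned in the 6d/6e notes above come from simple rounding of jesus-forward-input-dim (500) divided by the number of blocks; a quick check of the arithmetic quoted there:

  awk 'BEGIN {
    printf("default (100 blocks): 500/100 = %.2f -> %d dims per block\n", 500/100, int(500/100 + 0.5));
    printf("6d      (84 blocks) : 500/84  = %.2f -> %d dims per block\n", 500/84,  int(500/84  + 0.5));
    printf("6e: 72 = ceil(500/7) -> %d\n", int(500/7) + (500 % 7 ? 1 : 0));
  }'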
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
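The bracketed numbers in the show_wer.sh lines quoted in these headers are simply (insertions + deletions + substitutions) over the number of reference words; for example, the 4f train_dev line quoted above checks out as:

  # 870 ins + 2354 del + 5058 sub = 8282 errors over 49204 reference words
  awk 'BEGIN { printf("%.2f%% WER\n", 100 * (870 + 2354 + 5058) / 49204) }'   # -> 16.83% WER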
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 72 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6f.sh b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh new file mode 100755 index 00000000000..fb7ff03b66d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +# note, 5v2 is a rerun of 5v. +# local/chain/compare_wer.sh 5v 5v2 6f +# System 5v 5v2 6f +# WER on train_dev(tg) 15.38 15.74 15.71 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.111305 +# Final valid prob -0.131797 -0.129516 -0.131487 + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
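+
+# (A note on the LM options mentioned above: the x / y / z / 2a runs used the
+# older option name --num-extra-states, while the newer LM-estimation code in
+# use from 2i onwards -- and in the train_tdnn.sh call further down this
+# script -- takes --num-extra-lm-states.  Redoing that sweep with the current
+# code would roughly amount to passing, for each run,
+#   --lm-opts "--num-extra-lm-states=$n"   # for n in 0 500 2000 8000
+# to steps/nnet3/chain/train_tdnn.sh; the values are just the ones tried above.)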
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6g.sh b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh new file mode 100755 index 00000000000..8d4e8b79fd0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh @@ -0,0 +1,491 @@ +#!/bin/bash + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# seems better than 6f, and about the same as (5v,5v2). encouraging. +# note, 5v2 is rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6f 6g +#System 5v 5v2 6f 6g +#WER on train_dev(tg) 15.38 15.74 15.71 15.50 +#WER on train_dev(fg) 14.39 14.50 14.50 14.31 +#WER on eval2000(tg) 17.4 17.5 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.111305 -0.105853 +#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997 + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
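+
+# (That is also the setting this script uses: the config section below sets
+# leftmost_questions_truncate=-1 and passes it straight through to the
+# tree-building stage, roughly
+#   steps/nnet3/chain/build_tree.sh --leftmost-questions-truncate -1 \
+#     --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+# so -1 simply means the truncation mechanism is disabled, as described above.)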
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0. +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
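+
+# (Rough arithmetic behind "broadly the same": the four k->m deltas quoted
+# above are +0.2 (20.7->20.9), -0.3 (18.9->18.6), -0.4 (19.3->18.9) and 0.0
+# (17.6->17.6), i.e. about -0.1% absolute on average, which is comparable to
+# rerun-to-rerun differences such as 5v vs 5v2 near the top of this file.)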
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
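+
+  # Note: the call below re-uses the egs dumped for the 2y run (via --egs-dir
+  # exp/chain/tdnn_2y_sp/egs) rather than dumping new ones; as noted in the 4r
+  # comment above, the splice-indexes were chosen so that the network's context
+  # still fits within those existing egs.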
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh new file mode 100755 index 00000000000..f3065cec603 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh @@ -0,0 +1,494 @@ +#!/bin/bash + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. +#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
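# Back-of-envelope on the _2y note above: raising --frames-per-iter by 1.5x covers the
# same number of epochs in about 1.5x fewer iterations, which is what amortises the
# fixed per-iteration costs (model averaging, job startup) when an iteration only takes
# a minute or so.  Pure arithmetic, nothing measured here:
awk 'BEGIN {
  old = 800000; new = 1200000;
  printf("frames-per-iter %d -> %d: iteration count (and per-iteration overhead) scales by %.2f\n",
         old, new, old / new); }'
# Each iteration then does ~1.5x more compute, but the fixed overhead per iteration is unchanged.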
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
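# The _p note above describes the per-frame derivative weights that get dumped with the
# egs: with the 30-frame overlap mentioned there, the first/last 10 frames of a chunk get
# zero weight and the next 10 ramp linearly up to 1.0.  A toy rendering of that shape for
# a 150-frame chunk (the ramp shape and frame counts are taken from the comment; the real
# weights come from the egs-dumping code, not from this snippet):
awk -v n=150 'BEGIN {
  for (t = 0; t < n; t++) {
    d = (t < n - 1 - t) ? t : n - 1 - t;              # distance from the nearer edge
    w = (d < 10) ? 0.0 : (d < 20) ? (d - 9) / 10.0 : 1.0;
    printf("frame %3d  deriv-weight %.1f\n", t, w);
  }
}' | head -25        # first 25 frames; the tail of the chunk mirrors the head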
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
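# A note on the --initial/final-effective-lrate options passed to train_tdnn.sh just
# below: as I understand the nnet3 options, the "effective" rate is the per-job learning
# rate divided by the number of parallel jobs, so each job actually sees
# effective_lrate * num_jobs, with num_jobs ramping from $num_jobs_initial to
# $num_jobs_final.  A rough sketch of that schedule; the iteration count and the exact
# interpolation used by the real script are assumptions here:
num_iters=300     # placeholder; the training script derives this from the data
awk -v i0=$initial_effective_lrate -v i1=$final_effective_lrate \
    -v j0=$num_jobs_initial -v j1=$num_jobs_final -v n=$num_iters 'BEGIN {
  for (it = 0; it <= n; it += n / 4) {
    eff  = i0 * exp(log(i1 / i0) * it / n);           # exponentially decaying effective rate
    jobs = int(j0 + (j1 - j0) * it / n + 0.5);        # linearly ramping job count
    printf("iter %3d: num-jobs=%2d  effective-lrate=%.6f  per-job lrate=%.6f\n",
           it, jobs, eff, eff * jobs);
  }
}'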
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh new file mode 100755 index 00000000000..b0f38b9fb0f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is a replica of_6h script, but makes use of the python trainer +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h_py # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/make_jesus_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --jesus-forward-input-dim 600 \ + --jesus-forward-output-dim 1700 \ + --jesus-hidden-dim 0 \ + --jesus-stddev-scale 0.2 \ + --final-layer-learning-rate-factor 0.25 \ + --self-repair-scale 0.00001 \ + --xent-separate-forward-affine=true \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6i.sh b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh new file mode 100755 index 00000000000..457b424be73 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh @@ -0,0 +1,497 @@ +#!/bin/bash + +# _6i takes aspects from 5n and 6g. 
Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. + +# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
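# On the ':' entries added in the _3f note above: as I read that splice-indexes string,
# offsets before a ':' are ordinary feed-forward splice offsets and offsets after it feed
# the layer its own earlier output (hence "more like an RNN").  A small parse of the
# notation, purely for illustration (the real parsing is done by the config-generation
# script):
spec="-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
layer=1
for s in $spec; do
  feedforward=${s%%:*}
  recurrent=${s#*:}
  [ "$recurrent" = "$s" ] && recurrent=none
  echo "layer $layer: feed-forward offsets [$feedforward]  recurrent offsets [$recurrent]"
  layer=$((layer + 1))
done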
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6j.sh b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh new file mode 100755 index 00000000000..ded13de9470 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
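# A quick arithmetic check on the _5n note above (doubling the input frame rate and
# scaling --frames-per-iter to keep the data per iteration comparable), measured in
# seconds of audio rather than frames; the numbers are the ones given in the note:
awk 'BEGIN {
  printf("1.6M frames x 5 ms = %.2f h of audio per iteration (matches the previous setup, per the note)\n",
         1600000 * 0.005 / 3600);
  printf("2.0M frames x 5 ms = %.2f h of audio per iteration (the value actually used)\n",
         2000000 * 0.005 / 3600);
}'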
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
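# The _4r note above shrinks the per-layer splice contexts so that egs dumped for the
# earlier setup can be re-used: egs carry a fixed amount of left/right context, so a new
# model can only reuse them if it does not need *more* total context.  The total context
# is just the sum over layers of the most negative / most positive splice offsets.  A
# small helper to check this for a --splice-indexes string (illustration only; the
# training scripts compute this themselves), run on the string this script later passes
# to train_tdnn.sh:
context() {
  echo "$1" | awk '{
    left = 0; right = 0;
    for (i = 1; i <= NF; i++) {
      n = split($i, a, ",");
      min = a[1] + 0; max = a[1] + 0;
      for (j = 2; j <= n; j++) { v = a[j] + 0; if (v < min) min = v; if (v > max) max = v; }
      if (min < 0) left -= min;
      if (max > 0) right += max;
    }
    printf("left-context=%d right-context=%d\n", left, right);
  }'
}
context "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4"   # -> left-context=19 right-context=17
# (Recurrent ':' entries, as in the _3f setups, would need extra handling.)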
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
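# A note on the two subsampling options on the train_tdnn.sh call just below (this is my
# reading of the options, not something stated in the scripts): --frame-subsampling-factor 4
# means the network emits one output every 4 input frames, and --alignment-subsampling-factor 4
# subsamples the tri4 lattices/alignments (which are at the standard 10 ms rate) by the same
# factor so they line up with the output rate.  In the 6i script above the inputs are at
# 7.5 ms, so a factor-4 output step is 30 ms = 3 x 10 ms, which is why 6i uses an alignment
# factor of 3 instead.  Quick arithmetic check:
awk 'BEGIN {
  printf("6j: 10.0 ms x 4 = %2.0f ms output shift -> alignment factor %g (alignments at 10 ms)\n",
         10.0 * 4, 10.0 * 4 / 10);
  printf("6i:  7.5 ms x 4 = %2.0f ms output shift -> alignment factor %g (alignments at 10 ms)\n",
          7.5 * 4,  7.5 * 4 / 10);
}'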
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6k.sh b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh new file mode 100755 index 00000000000..4625da200e6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6k is as _6i, but one more epoch. After running the first few stages, I'm +# copying the last model from 6i and starting from that point, to save compute. +# No better. +#local/chain/compare_wer.sh 6i 6k +#System 6i 6k +#WER on train_dev(tg) 15.62 15.67 +#WER on train_dev(fg) 14.46 14.47 +#WER on eval2000(tg) 17.3 17.4 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417-0.0994163 +#Final valid prob -0.123985 -0.122743 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. 
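+# (Concretely, in the splice indexes used below,
+# "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4", the offsets in the first
+# two layers step by 1 input frame, the third layer's offsets are all multiples
+# of 2 (presumably the first 2x subsampling), and the later layers' offsets are
+# multiples of 4 (the second 2x), so the output frame shift is 4 x 7.5 ms = 30 ms,
+# the same as the usual 3 x 10 ms.)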
+ +# local/chain/compare_wer.sh 6h 6i +# System 6h 6i +# WER on train_dev(tg) 15.46 15.62 +# WER on train_dev(fg) 14.28 14.46 +# WER on eval2000(tg) 17.4 17.3 +# WER on eval2000(fg) 15.7 15.8 +# Final train prob -0.105663 -0.10417 +# Final valid prob -0.130166 -0.123985 + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramp up to a weight of 1.0 over 10 frames. +
+# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. +
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. +
+# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. +
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true) +
+# _g is as _f but more splicing at last layer. +
+# _f is as _e but with 30 as the number of left phone classes instead +# of 10. +
+# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. +
+# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. +
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. +
+# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. +
+set -e +
+# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6k # Note: _sp will get added to this if $speed_perturb == true. +
+# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 200 frames at the 7.5ms frame shift is equivalent to 150 at the 10ms frame rate. +remove_egs=false +
+# End configuration section. +echo "$0 $@" # Print the command line for logging +
+. cmd.sh +. ./path.sh +. ./utils/parse_options.sh +
+if ! cuda-compiled; then + cat <$lang/topo +fi +
+if [ $stage -le 11 ]; then + # Build a tree using our new topology.
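+  # (Presumably the factor of 3 here, rather than the 4 used for the network,
+  # is because the alignments in $ali_dir are at the standard 10 ms frame rate,
+  # so subsampling them by 3 gives a 30 ms shift, matching the network output
+  # rate of 4 x 7.5 ms = 30 ms; the same reasoning gives
+  # --alignment-subsampling-factor 3 in the train_tdnn.sh call below.)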
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6i_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
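+  # (0.0075 s = 7.5 ms, the frame shift of the _hiresf features generated in
+  # stage 12.)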
+fi +
+if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi +
+decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6l.sh b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh new file mode 100755 index 00000000000..f1e0821f2cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh @@ -0,0 +1,521 @@ +#!/bin/bash +
+# _6l is as _6i, but adding the option --xent-separate-forward-affine=true which +# I had accidentally omitted, and adding 4 frames more left context and 2 frames +# more right context. +
+# Below I'm also comparing with 6h, which (since we now added +# --xent-separate-forward-affine=true) is the appropriate normal-frame-rate +# baseline, rather than 6g. +
+# This experiment is better than 6i, but there is no clear difference with +# 6h. So we can't really say that we're getting any benefit from the higher +# frame rate. +
+#local/chain/compare_wer.sh 6h 6i 6l +#System 6h 6i 6l +#WER on train_dev(tg) 15.46 15.62 15.42 +#WER on train_dev(fg) 14.28 14.46 14.25 +#WER on eval2000(tg) 17.4 17.3 17.3 +#WER on eval2000(fg) 15.7 15.8 15.8 +#Final train prob -0.105663 -0.10417 -0.0984719 +#Final valid prob -0.130166 -0.123985 -0.119088 +#Final train prob (xent) -1.42483 -1.60566 -1.46581 +#Final valid prob (xent) -1.49792 -1.67945 -1.51644 +
+
+# _6i takes aspects from 5n and 6g. Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# The idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +
+# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 +
+# _5n is as _5j (also omitting the iVectors), but doubling the input frame +# rate, going from a 10 ms to a 5 ms frame shift (and reducing the frame width +# from 25 to 20 ms), and modifying the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training time per +# iteration is getting shorter than I like (-> wasting time waiting for the queue). +
+# A very nice improvement on dev; small improvement on eval2000 though.
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramp up to a weight of 1.0 over 10 frames. +
+# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. +
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. +
+# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. +
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true) +
+# _g is as _f but more splicing at last layer. +
+# _f is as _e but with 30 as the number of left phone classes instead +# of 10. +
+# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. +
+# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. +
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. +
+# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. +
+set -e +
+# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6l # Note: _sp will get added to this if $speed_perturb == true. +
+# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since an + # epoch encompasses all frame-shifts of the data and we now have 4 + # frame-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 200 frames at the 7.5ms frame shift is equivalent to 150 at the 10ms frame rate. +remove_egs=false +
+# End configuration section. +echo "$0 $@" # Print the command line for logging +
+. cmd.sh +. ./path.sh +. ./utils/parse_options.sh +
+if ! cuda-compiled; then + cat <$lang/topo +fi +
+if [ $stage -le 11 ]; then + # Build a tree using our new topology.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{05,b11,b12,b13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2,4 -4,0,4 -4,0,4 -8,-4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
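+  # (For reference: the extra context mentioned at the top shows up in the
+  # splice indexes above, relative to 6i/6k: the third layer uses "-4,-2,0,2,4"
+  # instead of "-4,-2,0,2" (2 more frames of right context) and the last layer
+  # uses "-8,-4,0,4" instead of "-4,0,4" (4 more frames of left context).)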
+fi +
+if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi +
+decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6m.sh b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh new file mode 100755 index 00000000000..8a7b14ef342 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh @@ -0,0 +1,497 @@ +#!/bin/bash +
+# _6m is as _6j (which subsamples by a factor of 4, not 3, at the output), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). +
+# This is unhelpful and, if anything, a little worse. +#local/chain/compare_wer.sh 6j 6m +#System 6j 6m +#WER on train_dev(tg) 15.86 16.08 +#WER on train_dev(fg) 14.79 14.85 +#WER on eval2000(tg) 17.6 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.131444 -0.131515 +#Final valid prob -0.167574 -0.17046 +#Final train prob (xent) -1.45908 -1.43814 +#Final valid prob (xent) -1.55937 -1.5412 +
+# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# Also reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. +
+# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +
+# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# The idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had to choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at +
+# _5n is as _5j (also omitting the iVectors), but doubling the input frame +# rate, going from a 10 ms to a 5 ms frame shift (and reducing the frame width +# from 25 to 20 ms), and modifying the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training time per +# iteration is getting shorter than I like (-> wasting time waiting for the queue). +
+# A very nice improvement on dev; small improvement on eval2000 though.
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
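+
+# (Illustrative sketch, for orientation only: the _2d-style LM options quoted
+# above would have been passed to the training script along the lines of
+#   --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# whereas the newer LM-estimation code used from _2i onwards, and in stage 12 of
+# this script, takes just
+#   --lm-opts "--num-extra-lm-states=2000")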
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # an epoch encompasses all frame-shifts of the data and we now have 4 + # frame-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6n.sh b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh new file mode 100755 index 00000000000..625cb73cf50 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _6n is as _6m, but with a less-wide splicing context. + +# The effect is inconsistent- there is none, on average. 
+#System 6j 6m 6n +#WER on train_dev(tg) 15.86 16.08 16.01 +#WER on train_dev(fg) 14.79 14.85 14.66 +#WER on eval2000(tg) 17.6 17.6 17.7 +#WER on eval2000(fg) 15.8 15.8 15.9 +#Final train prob -0.131444 -0.131515 -0.133681 +#Final valid prob -0.167574 -0.17046 -0.172072 +#Final train prob (xent) -1.45908 -1.43814 -1.53108 +#Final valid prob (xent) -1.55937 -1.5412 -1.65137 + +# _6m is as _6j (which subsamples by 4 frames), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had to choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per iteration is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+ +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
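+
+# (Illustrative aside on the _3f notation above: the recurrence is the trailing
+# ":-3" on a layer's splice indexes, e.g.
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"   # _3f, recurrent
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"            # _3d, feed-forward
+# i.e. the layers with a ":-3" suffix get an additional, recurrent connection at
+# an offset of -3 frames, which is the "(left) recurrence" described above.)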
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
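+
+# (Illustrative sketch of the _p edge-weighting described a couple of paragraphs
+# above, assuming a linear ramp: with 10 zero-weight frames and a 10-frame ramp
+# at each edge of an eg, the per-frame derivative weights would look roughly like
+#   frame:   1 ... 10 | 11   12   ...  20 | 21 ...
+#   weight:  0.0      | 0.1  0.2  ...  1.0 | 1.0
+# i.e. zero derivative right at the edges, ramping up to the full weight of 1.0.)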
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
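+
+  # (Descriptive note: the --egs-dir option below re-uses the egs already dumped
+  # by the 6m run; 6n only narrows the splice indexes relative to 6m, so those
+  # egs are assumed to still provide enough context.)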
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6m_sp/egs \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -2,0,2 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6o.sh b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh new file mode 100755 index 00000000000..e07e6092644 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6o is as _6h but halving the --l2-regularize option, because since the +# time we last tuned this, other regularization methods have been added. + +#It's worse. +#local/chain/compare_wer.sh 6h 6o +#System 6h 6o +#WER on train_dev(tg) 15.46 15.61 +#WER on train_dev(fg) 14.28 14.58 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.105663-0.0992526 +#Final valid prob -0.130166 -0.127421 +#Final train prob (xent) -1.42483 -1.4369 +#Final valid prob (xent) -1.49792 -1.49867 + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. 
+#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-dim from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code for a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
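+
+# (Connecting note: the _2m change above corresponds to the
+# leftmost_questions_truncate=-1 setting in the configuration section of this
+# script; it is passed to the tree-building stage as
+#   --leftmost-questions-truncate $leftmost_questions_truncate
+# in stage 11 below, so the truncation mechanism stays disabled here.)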
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
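+
+# (Connecting note on the _2c change above: using transition and self-loop scales
+# of 1 instead of 0 is why stage 13 below builds the decoding graph with
+#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+# i.e. the test-time options are kept consistent with the scales used in training.)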
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
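+
+  # (Descriptive note: the --egs-dir option below re-uses the egs dumped for the
+  # 2y run (exp/chain/tdnn_2y_sp/egs), as several earlier experiments in this
+  # series did; the splicing context is presumably kept small enough for those
+  # egs to remain usable.)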
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.000025 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6p.sh b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh new file mode 100755 index 00000000000..a9f7eef9bbc --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh @@ -0,0 +1,503 @@ +#!/bin/bash + +# _6p is as _6j, but increasing the various regularization coefficients. +# the intention is to increase them by 4/3, since they are all evaluated +# once per output frame, and there are now fewer output frames by a factor +# of 3/4. To make them rounder numbers, I increased some by a factor +# of 5/4 (--xent-regularize, 0.1 -> 0.125, and --leaky-hmm-coefficient, +# 0.1 -> 0.125), and l2-regularize by 3/2 (0.00005 -> 0.000075). + +# Worse. +#local/chain/compare_wer.sh 6j 6p +#System 6j 6p +#WER on train_dev(tg) 15.86 15.91 +#WER on train_dev(fg) 14.79 14.76 +#WER on eval2000(tg) 17.6 17.9 +#WER on eval2000(fg) 15.8 15.9 +#Final train prob -0.131444 -0.143285 +#Final valid prob -0.167574 -0.173759 +#Final train prob (xent) -1.45908 -1.44287 +#Final valid prob (xent) -1.55937 -1.52918 + + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. 
+# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramp up to a weight of 1.0 over the next 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
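To make the '_p' note above concrete, the snippet below is a purely illustrative sketch, not part of any recipe in this patch: it prints one plausible reading of that edge weighting for an N-frame chunk, assuming the derivative weight is 0 for the 10 frames nearest each edge, ramps linearly to 1.0 over the next 10 frames, and is 1.0 in the interior. The value of N and the exact ramp are assumptions for illustration; the real weights are produced inside the egs-dumping code.

N=150   # e.g. the frames_per_eg used in these runs (assumed here for illustration)
awk -v n="$N" 'BEGIN {
  for (t = 0; t < n; t++) {
    d = (t < n - 1 - t) ? t : n - 1 - t;   # distance to the nearer edge of the chunk
    if (d < 10)      w = 0.0;              # zero derivative right at the edges
    else if (d < 20) w = (d - 9) / 10.0;   # linear ramp 0.1 .. 1.0
    else             w = 1.0;              # full weight in the interior
    printf("%d %.1f\n", t, w);             # frame index and its derivative weight
  }
}'

The intent, as described above, appears to be that the 30-frame overlap lets neighboring chunks cover each other's down-weighted edges, rather than simply cutting derivatives off with --min-deriv-time and --max-deriv-time.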
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6j_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.125 \ + --leaky-hmm-coefficient 0.125 \ + --l2-regularize 0.000075 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6q.sh b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh new file mode 100755 index 00000000000..440da3a1d6b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh @@ -0,0 +1,493 @@ +#!/bin/bash + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. +# (note, I forgot the self-repair-scale, and I probably should have used +# 6h as the baseline because it has --xent-separate-forward-affine=true; +# note, this experiment doesn't have --xent-separate-forward-affine=true but +# it would have been better to have it (retrying as 6r) + +# we're about 0.2% better than 6g. 
+#local/chain/compare_wer.sh 6g 6q +#System 6g 6q +#WER on train_dev(tg) 15.50 15.25 +#WER on train_dev(fg) 14.31 14.24 +#WER on eval2000(tg) 17.5 17.2 +#WER on eval2000(fg) 15.8 15.6 +#Final train prob -0.105853 -0.106936 +#Final valid prob -0.129997 -0.123066 +#Final train prob (xent) -1.4718 -1.66328 +#Final valid prob (xent) -1.55129 -1.71979 + + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=13 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6r.sh b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh new file mode 100755 index 00000000000..ffbac19d1eb --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +# _6r is as _6q, but adding --self-repair-scale 0.00001 +# --xent-separate-forward-affine=true. the appropriate normal-frame-rate +# baseline for this is 6h (since it has --xent-separate-forward-affine=true), +# so using that as the baseline: + +#local/chain/compare_wer.sh 6h 6r +#System 6h 6r +#WER on train_dev(tg) 15.46 15.06 +#WER on train_dev(fg) 14.28 14.05 +#WER on eval2000(tg) 17.4 17.2 +#WER on eval2000(fg) 15.7 15.4 +#Final train prob -0.105663 -0.106685 +#Final valid prob -0.130166 -0.122293 +#Final train prob (xent) -1.42483 -1.62108 +#Final valid prob (xent) -1.49792 -1.67695 + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6q_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6s.sh b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh new file mode 100755 index 00000000000..4693dde0a31 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh @@ -0,0 +1,502 @@ +#!/bin/bash + + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only difference is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. 
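To make the "all multiplied by 2" transformation concrete, here is a small sketch (not part of the patch); the 6h splice string is an assumption, inferred by halving the 6s values passed to train_tdnn.sh further down:

# Hypothetical illustration: derive the 6s splice indexes from the (assumed) 6h ones.
splice_6h="-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0"   # assumption, not taken from this patch
splice_6s=$(printf '%s\n' "$splice_6h" | perl -pe 's/(-?\d+)/$1*2/ge')
printf '%s\n' "$splice_6s"   # -> -2,0,2 -2,0,2,4 -6,0,6 -6,0,6 -6,0,6 -12,-6,0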
+ +# So we lose the improvement that we got in 6r (see below). This is consistent +# with the idea that we really do need the higher-frame-rate input, but it's +# also possible that some slight differences in the splicing indexes were +# responsible, so in 6t we'll do an experiment where we try to get closer +# to the splicing setup of 6r. +# +# local/chain/compare_wer.sh 6h 6r 6s +#System 6h 6r 6s +#WER on train_dev(tg) 15.46 15.06 15.50 +#WER on train_dev(fg) 14.28 14.05 14.45 +#WER on eval2000(tg) 17.4 17.2 17.5 +#WER on eval2000(fg) 15.7 15.4 15.7 +#Final train prob -0.105663 -0.106685 -0.105965 +#Final valid prob -0.130166 -0.122293 -0.122376 +#Final train prob (xent) -1.42483 -1.62108 -1.5454 +#Final valid prob (xent) -1.49792 -1.67695 -1.58129 + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
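The _2c note above mentions that using transition/self-loop scales of 1.0 (and building the test graph with --self-loop-scale 1.0, as in stage 15 below) is what makes it convenient to push weights and check that graphs are stochastic. A hedged sketch of such a check with Kaldi's fstisstochastic tool, which prints the min/max deviation of the arc sums from one (values near zero mean the FST is close to stochastic); the graph path is just the one this script would create:

# Sketch only: rough stochasticity check on a compiled decoding graph.
. ./path.sh
fstisstochastic exp/chain/tdnn_6t_sp/graph_sw1_tg/HCLG.fst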
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-2,0,2 -2,0,2,4 -6,0,6 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6t.sh b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh new file mode 100755 index 00000000000..47921335155 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# since _6s didn't work that well, in 6t we try something else: +# modifying 6s to use almost exactly the same splicing indexes as 6r, +# but with the first splice indexes changed from -1,0,1 to -1,1, so that +# all the differences are multiples of 2 (so the effective frame rate is +# the normal frame rate). In effect we're using a narrower splicing +# at the start of the nnet, than 6s. + +# 6t does seem better than 6s, but not quite as good as 6r. 
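To see why "all the differences are multiples of 2" brings the effective frame rate back to normal, here is a small sketch (not in the patch) that enumerates which input-frame offsets a single output frame can reach through the 6t splice indexes used below, assuming the usual additive composition of per-layer splice offsets; every reachable offset comes out odd, so only every other 5 ms frame is ever consulted:

# Sketch: reachable input-frame offsets for the 6t splicing
# "-1,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" (values from the command below).
offsets="0"
for layer in "-1 1" "-2 0 2" "-4 -2 0 2" "-6 0 6" "-6 0 6" "-12 -6 0"; do
  new=""
  for o in $offsets; do
    for d in $layer; do new="$new $((o + d))"; done
  done
  offsets=$(printf '%s\n' $new | sort -n | uniq | tr '\n' ' ')
done
printf 'reachable offsets: %s\n' "$offsets"
printf 'parities seen: %s\n' "$(for o in $offsets; do echo $(( ((o % 2) + 2) % 2 )); done | sort -u | tr '\n' ' ')"   # -> 1 (all odd)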
+# the fact that it's not as good as 6r may show that the double-frame-rate +# input was actually giving us some useful information-- although the +# improvement is only something like 0.1%-0.2%, and we didn't actually see +# any difference in the objective function from 6r, which undermines the +# notion that by removing that central 0 splice at the input, we lost +# some information. +# +# +#local/chain/compare_wer.sh 6r 6s 6t +#System 6r 6s 6t +#WER on train_dev(tg) 15.06 15.50 15.34 +#WER on train_dev(fg) 14.05 14.45 14.23 +#WER on eval2000(tg) 17.2 17.5 17.2 +#WER on eval2000(fg) 15.4 15.7 15.6 +#Final train prob -0.106685 -0.105965 -0.106575 +#Final valid prob -0.122293 -0.122376 -0.121902 +#Final train prob (xent) -1.62108 -1.5454 -1.62226 +#Final valid prob (xent) -1.67695 -1.58129 -1.67252 + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only differences is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. 
Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
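As an aside on reading the eval2000 lines above: the pipe-separated fields follow the standard sclite summary order, #Snt #Wrd | Corr Sub Del Ins Err S.Err, so Sub + Del + Ins should reproduce the overall Err up to rounding. A quick check against the 3g eval2000(tg) line:

# Sketch: sanity-check the sclite column order on the 3g eval2000(tg) line above.
awk 'BEGIN { printf("Sub+Del+Ins = %.1f (reported Err: 18.7)\n", 11.1 + 5.3 + 2.2) }'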
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6s_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6u.sh b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh new file mode 100755 index 00000000000..4c48a75ffd6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh @@ -0,0 +1,524 @@ +#!/bin/bash + +# _6u is as _6h, but with slightly different splicing indexes (start +# narrower than 6h and ramp up slowly). These are designed to be +# equivalent to those in 6t, except for use with normal-frame-rate, +# not double-frame-rate, input. The difference between 6t and 6u +# will show us whether having double-frame-rate input for the purpose +# of getting more different shifted versions of the input, is helpful. 
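The "more different shifted versions" argument above, and the factor-of-two bookkeeping in the double-frame-rate scripts (ivector_period 10 -> 20, frames_per_eg 150 -> 300, frame_shift written as 0.005), reduce to the same arithmetic. A small sketch, assuming (as in the standard chain egs generation) that the data is shifted by 0 .. frame-subsampling-factor - 1 input frames:

# Sketch: distinct data shifts available to each kind of system (assumed shift scheme).
# Double-frame-rate systems (6r/6s/6t): 5 ms frames, frame-subsampling-factor 6.
for s in 0 1 2 3 4 5; do printf '%d ' $((s * 5)); done; echo "ms shifts (5 ms input, factor 6)"
# Normal-frame-rate systems (6h/6u): 10 ms frames, frame-subsampling-factor 3.
for s in 0 1 2; do printf '%d ' $((s * 10)); done; echo "ms shifts (10 ms input, factor 3)"
# The 5 ms input therefore sees twice as many distinct shifts, including half-frame ones.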
+# [however, note that the number of frames-per-iter is not comparable +# between 6t and 6u: here we're using 1.2 million frames per eg, +# and 6s is using 3 million which at the normal frame rate would be +# 1.5 million, and 1.2 != 1.5. + +# 6u is no better than 6h, and maybe slightly worse. Certainly it's worse than +# 6t. In addition, the train-valid difference is bigger with 6h and 6u than +# with 6t. This is all consistent with the notion that the higher-frame-rate +# input, with with we can generate more shifted versions, does really make a +# difference. However, I want to wait till the 6v->6w comparison is ready, +# which may let us know whether the difference in frames-per-iter could have +# been a confounding factor here. (It's unlikely, but possible). +# +#local/chain/compare_wer.sh 6h 6t 6u +#System 6h 6t 6u +#WER on train_dev(tg) 15.46 15.34 15.46 +#WER on train_dev(fg) 14.28 14.23 14.28 +#WER on eval2000(tg) 17.4 17.2 17.6 +#WER on eval2000(fg) 15.7 15.6 15.9 +#Final train prob -0.105663 -0.106575 -0.108665 +#Final valid prob -0.130166 -0.121902 -0.129495 +#Final train prob (xent) -1.42483 -1.62226 -1.54189 +#Final valid prob (xent) -1.49792 -1.67252 -1.60749 + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. +#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. 
I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average, +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which, it turns +# out, I never tested for WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0. +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
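As a rough illustration of the per-job overhead point made in the 2y notes above (each iteration launches jobs, averages models and so on, so fewer and larger iterations waste less time), the sketch below shows how --frames-per-iter relates to the number of training iterations. The total-frame count and average job count are hypothetical placeholders, not values from this recipe, and the frame-shifting of chain egs is ignored.

    # back-of-envelope only; all inputs except frames_per_iter are made up.
    total_frames=500000000     # hypothetical total training frames after speed perturbation
    frames_per_iter=1200000    # what this script passes to --frames-per-iter (2y raised it from 800k)
    num_epochs=4
    avg_num_jobs=10            # roughly (num_jobs_initial + num_jobs_final) / 2
    num_iters=$(( num_epochs * total_frames / (frames_per_iter * avg_num_jobs) ))
    echo "roughly $num_iters iterations, each paying the model-averaging overhead"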
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.0. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but a different application of max-param-change (use --scale-max-param-change true). + +# _g is as _f but more splicing at the last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using a 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ !
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6v.sh b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh new file mode 100755 index 00000000000..158405a4058 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# _6v is as _6h, but moving to a TDNN+ReLU recipe instead of using jesus-layer. +# Otherwise we make everything as similar as possible to 6h. +# The ReLU dimension, at 576, is chosen to make the number of parameters about +# the same as 6h. + +# great improvement! +# local/chain/compare_wer.sh 6h 6v +# System 6h 6v +# WER on train_dev(tg) 15.46 15.00 +# WER on train_dev(fg) 14.28 13.91 +# WER on eval2000(tg) 17.4 17.2 +# WER on eval2000(fg) 15.7 15.7 + +# the following objf values are computed on the last iter (323), because due to +# a script bug, now fixed, the 'final' ones were not computed in 6v. +# note: in this run the xent learning rate was too slow. 
+# 323 train prob -0.129285 -0.120026 +# 323 valid prob -0.151648 -0.140628 +# 323 train prob (xent) -1.4443 -1.5431 +# 323 valid prob (xent) -1.51731 -1.56975 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6v # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
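The option-assembly lines in the config stage above (pool_opts, repair_opts) and the iter_opts/decode-directory naming in the decode stage below all rely on bash's ${var:+...} expansion, which emits the alternate text only when the variable is set and non-empty. A standalone illustration of the idiom, separate from this script:

    # illustration only: ${var:+text} expands to nothing when var is empty or unset.
    decode_iter=
    echo "decode_eval2000${decode_iter:+_$decode_iter}_sw1_tg"   # -> decode_eval2000_sw1_tg
    decode_iter=220                                              # hypothetical iteration number
    echo "decode_eval2000${decode_iter:+_$decode_iter}_sw1_tg"   # -> decode_eval2000_220_sw1_tg
    pool_type='none'; pool_window=
    pool_opts=${pool_type:+" --pool-type $pool_type "}${pool_window:+" --pool-window $pool_window "}
    echo "pool_opts:[$pool_opts]"                                # only the --pool-type part appears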
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6w.sh b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh new file mode 100755 index 00000000000..3e3bb622290 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# I discovered after running this that there was a problem with the egs-dumping, +# which seems to have existed for quite a while: the --right-tolerance defaults to 10 +# in the script, but it should have been 5, to match the code. However, 6v was +# run with older egs (before this bug was introduced) from 2y, so it doesn't +# have the problem. + +# note regarding the changes in objfs: these have explanations, they are due to +# the --right-tolerance increasing from 5->10 in 6v->6w: the chain objfs improve +# because of the less-restrictive numerator graphs, and the xent objfs get worse +# because the phone alignments become less consistent; we can see the reverse +# pattern in 6y -> 6z when we revert the right-tolerance back to 5. 
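The tolerance discussed above is an option of the egs-dumping stage rather than of the training script itself; anything placed in --egs.opts is passed through to it (that is how --frames-overlap-per-eg 0 gets there). A hedged sketch of pinning the value explicitly instead of relying on the script default that changed here, assuming the underlying get_egs script accepts --right-tolerance as the discussion implies:

    # sketch only, not part of this recipe: make the tolerance explicit so a
    # change in the script's default cannot silently alter freshly dumped egs.
    right_tolerance=5
    egs_opts="--frames-overlap-per-eg 0 --right-tolerance $right_tolerance"
    # ...and then pass   --egs.opts "$egs_opts"   on the train.py command line.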
+# +#local/chain/compare_wer.sh 6v 6w +#System 6v 6w +#WER on train_dev(tg) 15.00 15.33 +#WER on train_dev(fg) 13.91 14.27 +#WER on eval2000(tg) 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 +#Final train prob -0.105012 -0.10287 +#Final valid prob -0.125877 -0.120451 +#Final train prob (xent) -1.54736 -1.63586 +#Final valid prob (xent) -1.57475 -1.67173 + + + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6w # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
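The create_split_dir.pl call a few lines above uses plain bash brace expansion to name one physical location per disk; the script then spreads the dumped egs across them, leaving $dir/egs/storage as the single entry point, and the .nodelete marker, per its comment, keeps the egs around if that run dies. To preview which directories the brace pattern actually denotes before committing data to them:

    # illustration only: list the four storage directories the pattern expands to.
    dir=exp/chain/tdnn_6w_sp        # what $dir resolves to here once _sp is appended
    echo /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage \
      | tr ' ' '\n'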
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6x.sh b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh new file mode 100755 index 00000000000..177ddd2a867 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# 6x is as 6w, but changing the splice-indexes to be like in 6u +# except since this is a TDNN setup, we need a final "0" [the jesus-layer +# setup had a final ReLU as a special case.]. +# These splice indexes start smaller, and ramp up more slowly, than +# the baseline in 6w. +# We're reusing the 6x egs. + +# no clear benefit; if anything, it's slightly worse. +# local/chain/compare_wer.sh 6w 6x +# System 6w 6x +# WER on train_dev(tg) 15.33 15.30 +# WER on train_dev(fg) 14.27 14.35 +# WER on eval2000(tg) 17.3 17.4 +# WER on eval2000(fg) 15.6 15.7 +# Final train prob -0.10287 -0.103078 +# Final valid prob -0.120451 -0.122477 +# Final train prob (xent) -1.63586 -1.73292 +# Final valid prob (xent) -1.67173 -1.75042 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). 
+ +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6x # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
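The train.py call just below reuses the examples already dumped by the 6w run via --egs.dir exp/chain/tdnn_6w_sp/egs, so the egs-dumping options are effectively bypassed as long as that directory exists. These scripts also declare a common_egs_dir variable that otherwise goes unused; a hedged sketch of how it could be wired up for the same purpose (this is an assumption about intent, not what the script does):

    # sketch only; this script hard-codes the path instead.
    common_egs_dir=exp/chain/tdnn_6w_sp/egs          # leave empty to dump a fresh set
    egs_dir_opt=${common_egs_dir:+--egs.dir $common_egs_dir}
    # ...then put $egs_dir_opt (unquoted, so an empty value vanishes) on the
    # steps/nnet3/chain/train.py command line in place of the hard-coded option.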
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6y.sh b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh new file mode 100755 index 00000000000..a15c6648641 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# WER results are inconclusive, but objective values are encouraging. +# We'll keep the change as it makes sense. +# local/chain/compare_wer.sh 6w 6y +# System 6w 6y +# WER on train_dev(tg) 15.33 15.36 +# WER on train_dev(fg) 14.27 14.19 +# WER on eval2000(tg) 17.3 17.2 +# WER on eval2000(fg) 15.6 15.8 +# Final train prob -0.10287 -0.102139 +# Final valid prob -0.120451 -0.119654 +# Final train prob (xent) -1.63586 -1.55598 +# Final valid prob (xent) -1.67173 -1.58821 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6y # Note: _sp will get added to this if $speed_perturb == true. 
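The note on the dir= line above refers to the speed-perturbation convention these scripts share: when $speed_perturb is true, the training data and everything derived from it carry an _sp suffix (compare the exp/tri4_lats_nodup$suffix lattice directory used later). The suffix handling itself lives in a portion of the script not shown in this excerpt; a minimal sketch of the idea, with the data-directory name assumed rather than copied:

    # minimal sketch (train_set base name assumed): append _sp when perturbation is on.
    speed_perturb=true
    suffix=
    $speed_perturb && suffix=_sp        # "true"/"false" run as commands, a common Kaldi idiom
    dir=${dir}$suffix
    train_set=train_nodup$suffix        # assumed base name; adjust to the actual recipe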
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh new file mode 100755 index 00000000000..97cc1b83624 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? +# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). 
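The 'Final train prob' and 'Final valid prob' rows quoted throughout these headers come from the diagnostic jobs that training runs on held-out subsets of training and validation examples; the gap between the two is what the notes use as an overtraining signal (for example the train-valid remarks in the 6u/6t comparison). A hedged sketch of pulling those numbers from a finished run; the log names and message wording are assumptions and may differ between script versions:

    # hedged sketch: log locations and grep pattern assumed, adjust as needed.
    d=exp/chain/tdnn_6z_sp
    grep -h "Overall log-prob" \
      $d/log/compute_prob_train.final.log \
      $d/log/compute_prob_valid.final.log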
+ +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7a.sh b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh new file mode 100755 index 00000000000..95c3c9f4c24 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# 7a inherits from 6z (which is a TDNN+ReLU-based network with various small +# bugs hopefully fixed now), and from 6r, which is our most-successful +# double-frame-rate system. We're re-dumping the egs, because the egs used in +# 6r used right-tolerance=10, which turns out to have been a bug, and not a +# helpful one. + +# it is not better than 6z. +# local/chain/compare_wer.sh 6v 6z 7a +#System 6v 6z 7a +#WER on train_dev(tg) 15.00 15.18 15.05 +#WER on train_dev(fg) 13.91 14.06 14.10 +#WER on eval2000(tg) 17.2 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 15.7 +#Final train prob -0.105012 -0.106268 -0.110288 +#Final valid prob -0.125877 -0.126726 -0.127071 +#Final train prob (xent) -1.54736 -1.4556 -1.59569 +#Final valid prob (xent) -1.57475 -1.50136 -1.62312 + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. 
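7a is a double-frame-rate system: the _hires_dbl features later in this script use a 5 ms frame shift (the 0.005 written to $dir/frame_shift) instead of the usual 10 ms, so several quantities are simply doubled relative to the single-rate scripts: ivector_period 20 instead of 10, --egs.chunk-width 300 instead of 150, --trainer.frames-per-iter 3000000 instead of 1500000, frame-subsampling-factor 6 against alignment-subsampling-factor 3, and 2 epochs instead of 4 since each epoch now sees twice as many distinct frame shifts. The arithmetic below just checks that the doubled numbers describe the same amount of audio:

    # illustrative arithmetic only.
    frame_shift_ms=5                                          # vs. 10 ms in the single-rate recipes
    echo "audio per chunk:   $(( 300 * frame_shift_ms )) ms"      # 300 x 5 ms == 150 x 10 ms
    echo "ivector spacing:   $(( 20 * frame_shift_ms )) ms"       # period 20 x 5 ms == 10 x 10 ms
    echo "output frame rate: ~$(( 1000 / (frame_shift_ms * 6) )) Hz"  # subsampling 6 at 5 ms == 3 at 10 ms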
+ +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7a # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=2 # use 2 not 4 epochs, as with the double-frame-rate input, we + # shift the input data in double the number of distinct ways + # on each epoch. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires_dbl \ + --ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{7,11,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.frame-subsampling-factor 6 \ + --chain.alignment-subsampling-factor 3 \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 300 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 3000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_dbl \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 17 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh new file mode 100755 index 00000000000..8bde54f7eee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 7b is as 6z, but increasing the relu-dim slightly from 576 to 625. + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? +# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir exp/chain/tdnn_6z_sp/egs \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/show_wer.sh b/egs/swbd/s5c/local/chain/show_wer.sh new file mode 100755 index 00000000000..a82c4acf26d --- /dev/null +++ b/egs/swbd/s5c/local/chain/show_wer.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh +done diff --git a/egs/swbd/s5c/local/nnet2/run_nnet2.sh b/egs/swbd/s5c/local/nnet2/run_nnet2.sh index 0872560337b..e83c587a006 100755 --- a/egs/swbd/s5c/local/nnet2/run_nnet2.sh +++ b/egs/swbd/s5c/local/nnet2/run_nnet2.sh @@ -5,7 +5,7 @@ # units, on top of fMLLR features, on GPU. temp_dir= -dir=exp/nnet2_5 +dir=nnet2_5 has_fisher=true . ./cmd.sh @@ -18,10 +18,10 @@ parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll ( if [ ! -f exp/$dir/final.mdl ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d exp/$dir/egs/storage ]; then # spread the egs over various machines. utils/create_split_dir.pl \ - /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/exp/$dir/egs/storage exp/$dir/egs/storage fi steps/nnet2/train_pnorm_accel2.sh --parallel-opts "$parallel_opts" \ diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh new file mode 100755 index 00000000000..d46d5cc7238 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +train_stage=-10 +generate_alignments=true # false if doing ctc training +speed_perturb=true +speaker_perturb=true +lpc_order=100 +filter_nj=30 +spkf_per_spk=3 +perturb_suffix="" + +. ./path.sh +. 
./utils/parse_options.sh + +mkdir -p nnet3 +# perturbed data preparation +train_set=train_nodup + +if $speed_perturb; then + perturb_suffix="_sp" +fi + +if $speaker_perturb; then + perturb_suffix=$perturb_suffix"_fp" +fi + +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + echo "speed perturb the data" + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in train_nodup; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp_sp1 + utils/perturb_data_dir_speed.sh 0.95 data/${datadir} data/temp_sp2 + utils/perturb_data_dir_speed.sh 1.05 data/${datadir} data/temp_sp3 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp_sp4 + + utils/combine_data.sh data/${datadir}_temp_sp data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4 + utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp + rm -r data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4 + + if [ "$speaker_perturb" == "true" ]; then + echo "speaker perturbation of data" + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp_sp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_temp_sp data/temp_sp0 + utils/fix_data_dir.sh data/${datadir}_sp + + # compute filter correspond to different speed perturbed speaker. + spk_filters=spkfilters + mkdir -p $spk_filters + utils/split_data.sh data/${datadir}_sp $filter_nj + echo $filter_nj > data/${datadir}_sp/num_filter_jobs + + $decode_cmd JOB=1:$filter_nj data/${datadir}_sp/split$filter_nj/compute_filter.JOB.log \ + compute-filter --lpc-order=$lpc_order scp:data/${datadir}_sp/split$filter_nj/JOB/wav.scp \ + ark,scp:$spk_filters/spk_filter.JOB.ark,$spk_filters/spk_filter.JOB.scp || exit 1; + + # combine filters.scp files together + for n in $(seq $filter_nj); do + cat $spk_filters/spk_filter.$n.scp || exit 1; + done > data/${datadir}_sp/spk_filter.scp + echo "Finished generating filters per speakers." + + echo "Perturb data using speaker perturbation." + utils/perturb_data_signal_v2.sh $spkf_per_spk 'fp' data/${datadir}_sp data/${datadir}_temp_sp_fp + utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp_fp + fi + + echo "perturb_suffix=$perturb_suffix " + mfccdir=mfcc_perturbed + echo "Generating features using perturbed data" + steps/make_mfcc.sh --cmd "$decode_cmd" --nj 50 \ + data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_temp${perturb_suffix} + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}${perturb_suffix} data/${datadir}_temp${perturb_suffix} data/temp0 + utils/fix_data_dir.sh data/${datadir}${perturb_suffix} + rm -r data/temp0 data/${datadir}_temp${perturb_suffix} + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/train_nodup${perturb_suffix} data/lang_nosp exp/tri4 exp/tri4_ali_nodup${perturb_suffix} || exit 1 + fi +fi + +train_set=train_nodup${perturb_suffix} +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + # the 100k_nodup directory is copied seperately, as + # we want to use exp/tri2_ali_100k_nodup for lda_mllt training + # the main train directory might be speed_perturbed + for dataset in $train_set train_100k_nodup; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/${dataset}_hires + cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + if false; then #300 + for dataset in eval2000 train_dev; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + fi #300 + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi +if false; then #400 +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri2_ali_100k_nodup exp/nnet3/tri3b +fi + +if [ $stage -le 6 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). 
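# The inline Python above applies a random gain in [1/8, 2] to every wav.scp entry before
# the hires MFCCs are dumped; as the original comment notes, this helps because CMVN is not
# applied to these features. A roughly equivalent awk sketch (assuming the wav.scp entries
# are already piped commands ending in '|', as they are for Switchboard) would be:
awk 'BEGIN{srand()} {printf("%s sox --vol %.3f -t wav - -t wav - |\n", $0, 0.125 + rand()*1.875)}' \
  data/${train_set}_hires/wav.scp | sort -k1,1 -u > data/${train_set}_hires/wav.scp_scaled
mv data/${train_set}_hires/wav.scp_scaled data/${train_set}_hires/wav.scp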
+ steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi +fi #400 + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; + + for data_set in eval2000 train_dev; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; + done +fi + +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_lstm.sh b/egs/swbd/s5c/local/nnet3/run_lstm.sh index e53f3387fd4..11fc851cb71 100755 --- a/egs/swbd/s5c/local/nnet3/run_lstm.sh +++ b/egs/swbd/s5c/local/nnet3/run_lstm.sh @@ -20,16 +20,17 @@ has_fisher=true affix= speed_perturb=true common_egs_dir= +reporting_email= # LSTM options splice_indexes="-2,-1,0,1,2 0 0" lstm_delay=" -1 -2 -3 " label_delay=5 num_lstm_layers=3 -cell_dim=1280 +cell_dim=1024 hidden_dim=1024 -recurrent_projection_dim=384 -non_recurrent_projection_dim=384 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 chunk_width=20 chunk_left_context=40 chunk_right_context=0 @@ -55,7 +56,7 @@ frames_per_chunk= echo "$0 $@" # Print the command line for logging -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -81,40 +82,62 @@ ali_dir=exp/tri4_ali_nodup$suffix local/nnet3/run_ivector_common.sh --stage $stage \ --speed-perturb $speed_perturb || exit 1; -if [ $stage -le 9 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - steps/nnet3/lstm/train.sh --stage $train_stage \ - --label-delay $label_delay \ - --lstm-delay "$lstm_delay" \ - --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ - --num-chunk-per-minibatch $num_chunk_per_minibatch \ - --samples-per-iter $samples_per_iter \ - --splice-indexes "$splice_indexes" \ - --feat-type raw \ - --online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --cmvn-opts "--norm-means=false --norm-vars=false" \ - --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ - --momentum $momentum \ - --cmd "$decode_cmd" \ +if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + config_extra_opts=() + [ ! 
-z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --ali-dir $ali_dir \ --num-lstm-layers $num_lstm_layers \ + --splice-indexes "$splice_indexes " \ --cell-dim $cell_dim \ --hidden-dim $hidden_dim \ --recurrent-projection-dim $recurrent_projection_dim \ --non-recurrent-projection-dim $non_recurrent_projection_dim \ - --chunk-width $chunk_width \ - --chunk-left-context $chunk_left_context \ - --chunk-right-context $chunk_right_context \ - --egs-dir "$common_egs_dir" \ - --remove-egs $remove_egs \ - data/${train_set}_hires data/lang $ali_dir $dir || exit 1; + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + fi -graph_dir=exp/tri4/graph_sw1_tg if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=$samples_per_iter \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \ + --trainer.optimization.momentum=$momentum \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=100 \ + --use-gpu=true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +graph_dir=exp/tri4/graph_sw1_tg +if [ $stage -le 11 ]; then if [ -z $extra_left_context ]; then extra_left_context=$chunk_left_context fi @@ -129,7 +152,7 @@ if [ $stage -le 10 ]; then num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` steps/nnet3/lstm/decode.sh --nj 250 --cmd "$decode_cmd" \ --extra-left-context $extra_left_context \ - --extra-right-context $extra_right_context \ + --extra-right-context $extra_right_context \ --frames-per-chunk "$frames_per_chunk" \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg || exit 1; diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn.sh b/egs/swbd/s5c/local/nnet3/run_tdnn.sh index 448b5bd174c..5254bc31857 100755 --- a/egs/swbd/s5c/local/nnet3/run_tdnn.sh +++ b/egs/swbd/s5c/local/nnet3/run_tdnn.sh @@ -11,10 +11,13 @@ # --num-threads 16 and --minibatch-size 128. stage=0 +affix= train_stage=-10 has_fisher=true speed_perturb=true - +common_egs_dir= +reporting_email= +remove_egs=true . cmd.sh . 
./path.sh @@ -41,26 +44,52 @@ ali_dir=exp/tri4_ali_nodup$suffix local/nnet3/run_ivector_common.sh --stage $stage \ --speed-perturb $speed_perturb || exit 1; + if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --ali-dir $ali_dir \ + --relu-dim 1024 \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ + --use-presoftmax-prior-scale true \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 10 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/train_tdnn.sh --stage $train_stage \ - --num-epochs 2 --num-jobs-initial 3 --num-jobs-final 16 \ - --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ - --feat-type raw \ - --online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --cmvn-opts "--norm-means=false --norm-vars=false" \ - --initial-effective-lrate 0.0017 --final-effective-lrate 0.00017 \ - --cmd "$decode_cmd" \ - --relu-dim 1024 \ - data/${train_set}_hires data/lang $ali_dir $dir || exit 1; + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + fi graph_dir=exp/tri4/graph_sw1_tg -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then for decode_set in train_dev eval2000; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh index 3bce900aecf..26d77d10f15 100755 --- a/egs/swbd/s5c/local/score_sclite.sh +++ b/egs/swbd/s5c/local/score_sclite.sh @@ -50,7 +50,11 @@ if $reverse; then reorder_opt="--reorder=false" fi -if [ -f $dir/../frame_subsampling_factor ]; then + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then factor=$(cat $dir/../frame_subsampling_factor) || exit 1 frame_shift_opt="--frame-shift=0.0$factor" echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" diff --git a/egs/swbd/s5c/local/swbd1_data_download.sh b/egs/swbd/s5c/local/swbd1_data_download.sh index 00ec97c5028..d8f076b5141 100755 --- a/egs/swbd/s5c/local/swbd1_data_download.sh +++ b/egs/swbd/s5c/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). 
-## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,24 +23,19 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( cd $dir; if [ ! -d swb_ms98_transcriptions ]; then echo " *** Downloading trascriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz tar -xf switchboard_word_alignments.tar.gz fi diff --git a/egs/swbd/s5c/local/swbd1_data_prep.sh b/egs/swbd/s5c/local/swbd1_data_prep.sh index 57fb0ff56c8..9621e7fc06e 100755 --- a/egs/swbd/s5c/local/swbd1_data_prep.sh +++ b/egs/swbd/s5c/local/swbd1_data_prep.sh @@ -21,7 +21,7 @@ #check existing directories if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" exit 1; fi @@ -41,23 +41,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; diff --git a/egs/tedlium/s5/RESULTS b/egs/tedlium/s5/RESULTS index 9c494712aa8..0c209bddf7e 100644 --- a/egs/tedlium/s5/RESULTS +++ b/egs/tedlium/s5/RESULTS @@ -7,6 +7,27 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; d for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp exit 0 + +#---------------------------------Current results (after fixing the problem)--------------------------------- +# There was a problem with the language model preparation where the scripts expected to represent OOV words while +# the language model used to represent them. See `git log tedlium-unk-fix` for details. +# Fixing this causes a small decrease in WER. 
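# A quick way to check which OOV token an ARPA LM actually contains (the LM path below is
# only an example; point it at wherever the Cantab LM was unpacked):
zcat db/cantab-TEDLIUM/cantab-TEDLIUM.lm3.gz | grep -o -m1 -w -e '<unk>' -e '<UNK>'
# whichever token this prints is the one the lexicon and the prepare_lang.sh call must agree on.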
+ +# GMMs +# DEV SPEAKERS: +%WER 31.0 | 507 17792 | 73.5 20.2 6.3 4.5 31.0 97.2 | -0.032 | exp/tri1/decode_nosp_dev/score_11_0.0/ctm.filt.filt.sys +%WER 26.4 | 507 17792 | 77.8 16.7 5.5 4.2 26.4 95.5 | -0.066 | exp/tri2/decode_nosp_dev/score_13_0.0/ctm.filt.filt.sys +%WER 26.1 | 507 17792 | 77.2 16.3 6.5 3.4 26.1 95.5 | -0.106 | exp/tri2/decode_dev/score_14_1.0/ctm.filt.filt.sys +%WER 22.0 | 507 17792 | 81.6 13.2 5.2 3.6 22.0 93.9 | -0.189 | exp/tri3/decode_dev/score_13_1.0/ctm.filt.filt.sys + +# TEST SPEAKERS: +%WER 30.9 | 1155 27512 | 72.1 21.0 6.9 3.0 30.9 94.5 | 0.035 | exp/tri1/decode_nosp_test/score_12_0.5/ctm.filt.filt.sys +%WER 25.5 | 1155 27512 | 78.0 17.4 4.6 3.6 25.5 92.8 | -0.034 | exp/tri2/decode_nosp_test/score_12_0.0/ctm.filt.filt.sys +%WER 24.9 | 1155 27512 | 78.3 16.7 5.0 3.2 24.9 93.0 | -0.020 | exp/tri2/decode_test/score_14_0.5/ctm.filt.filt.sys +%WER 20.3 | 1155 27512 | 82.7 13.4 3.9 3.0 20.3 90.0 | -0.063 | exp/tri3/decode_test/score_14_0.5/ctm.filt.filt.sys + +#---------------------------------(Pre- fix for Cantab LM) Provided for reference---------------------------------- + # Results from Nikolay, using kaldi scoring: # %WER 35.17 [ 9677 / 27512, 1267 ins, 1681 del, 6729 sub ] exp/tri1/decode/wer_13 # %WER 30.03 [ 8262 / 27512, 1255 ins, 1367 del, 5640 sub ] exp/tri2/decode/wer_15 diff --git a/egs/tedlium/s5/cmd.sh b/egs/tedlium/s5/cmd.sh index bed97d34020..ba7f120e599 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -19,7 +19,7 @@ host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/tedlium/s5/local/prepare_dict.sh b/egs/tedlium/s5/local/prepare_dict.sh index a3207de050a..fcb03ea7aef 100755 --- a/egs/tedlium/s5/local/prepare_dict.sh +++ b/egs/tedlium/s5/local/prepare_dict.sh @@ -1,7 +1,8 @@ #!/bin/bash # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Daniel Galvez # Apache 2.0 # @@ -13,10 +14,11 @@ srcdict=db/cantab-TEDLIUM/cantab-TEDLIUM.dct [ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 # Join dicts and fix some troubles -cat $srcdict | grep -v "" | grep -v "" | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt +cat $srcdict | grep -v -w "" | grep -v -w "" | grep -v -w "" | \ + LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ - grep -v SIL | sort > $dir/nonsilence_phones.txt + grep -v SIL | sort > $dir/nonsilence_phones.txt ( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $dir/silence_phones.txt @@ -27,9 +29,11 @@ echo SIL > $dir/optional_silence.txt echo -n >$dir/extra_questions.txt # Add to the lexicon the silences, noises etc. +# Typically, you would use " NSN" here, but the Cantab Research language models +# use instead of to represent out of vocabulary words. 
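# For illustration, the -w added to the greps above makes the filtering word-bounded, so a
# special token is dropped without removing real entries that merely contain it as a
# substring; on a toy lexicon (hypothetical entries):
printf '%s\n' 'unk SPN' 'unknown AH N N OW N' | grep -v -w unk
# keeps only the 'unknown' pronunciation.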
(echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' - echo ' NSN' ) | \ + echo ' NSN' ) | \ cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt # Check that the dict dir is okay! diff --git a/egs/tedlium/s5/run.sh b/egs/tedlium/s5/run.sh index 7a36e49e8e0..e1dbf7b80e0 100755 --- a/egs/tedlium/s5/run.sh +++ b/egs/tedlium/s5/run.sh @@ -9,7 +9,7 @@ # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, # which allow free non-commercial use, while only a citation is required. # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) # Apache 2.0 # @@ -28,17 +28,18 @@ stage=0 # Data preparation if [ $stage -le 0 ]; then local/download_data.sh || exit 1 - + local/prepare_data.sh || exit 1 local/prepare_dict.sh || exit 1 utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_nosp data/lang_nosp || exit 1 + "" data/local/lang_nosp data/lang_nosp || exit 1 local/prepare_lm.sh || exit 1 fi + # Feature extraction feat_dir=$pwd/data/mfcc_features if [ $stage -le 1 ]; then @@ -100,7 +101,7 @@ if [ $stage -le 5 ]; then data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ exp/tri2/sil_counts_nowb.txt \ exp/tri2/pron_bigram_counts_nowb.txt data/local/dict - + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang cp -rT data/lang data/lang_test cp -rT data/lang data/lang_rescore @@ -134,6 +135,8 @@ if [ $stage -le 6 ]; then exp/tri3/graph data/test exp/tri3/decode_test || exit 1 fi +# steps/cleanup/debug_lexicon.sh --nj 100 --alidir exp/tri3 --cmd "$train_cmd" data/train data/lang exp/tri3 data/local/dict/lexicon.txt exp/tri3_debug_lexicon & + if [ $stage -le 7 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri3 exp/tri3_ali || exit 1 diff --git a/egs/thchs30/README.txt b/egs/thchs30/README.txt new file mode 100644 index 00000000000..acbdea4a263 --- /dev/null +++ b/egs/thchs30/README.txt @@ -0,0 +1,10 @@ +THCHS30 is an open Chinese speech database published by Center for Speech and Language Technology (CSLT) at Tsinghua University. + +The origional recording was conducted in 2002 by Dong Wang, supervised by Prof. Xiaoyan Zhu, at the Key State Lab of Intelligence and System, Department of Computer Science, Tsinghua Universeity, and the original name was 'TCMSD', standing for 'Tsinghua Continuous Mandarin Speech Database'. The publication after 13 years has been initiated by Dr. Dong Wang and was supported by Prof. Xiaoyan Zhu. We hope to provide a toy database for new researchers in the field of speech recognition. Therefore, the database is totally free to academic users. 
+ +The database can be downloaded from openslr: +http://www.openslr.org/18/ + +or from the CSLT server: +http://data.cslt.org/thchs30/README.html + diff --git a/egs/thchs30/s5/RESULTS b/egs/thchs30/s5/RESULTS new file mode 100644 index 00000000000..70718ea4c2a --- /dev/null +++ b/egs/thchs30/s5/RESULTS @@ -0,0 +1,61 @@ +#!/bin/bash +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_phone* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#phone task +%WER 31.49 [ 113986 / 362027, 20820 ins, 22043 del, 71123 sub ] exp/mono/decode_test_phone/wer_5 +%WER 20.56 [ 74445 / 362027, 15452 ins, 12457 del, 46536 sub ] exp/tri1/decode_test_phone/wer_5 +%WER 17.32 [ 62689 / 362027, 11937 ins, 11260 del, 39492 sub ] exp/tri2b/decode_test_phone/wer_6 +%WER 18.06 [ 65368 / 362027, 10426 ins, 13780 del, 41162 sub ] exp/tri3b/decode_test_phone/wer_5 +%WER 18.50 [ 66984 / 362027, 13117 ins, 11917 del, 41950 sub ] exp/tri3b/decode_test_phone.si/wer_5 +%WER 16.17 [ 58544 / 362027, 9628 ins, 11746 del, 37170 sub ] exp/tri4b/decode_test_phone/wer_6 +%WER 16.59 [ 60060 / 362027, 11440 ins, 10477 del, 38143 sub ] exp/tri4b/decode_test_phone.si/wer_6 +%WER 10.27 [ 37173 / 362027, 8675 ins, 6483 del, 22015 sub ] exp/tri4b_dnn/decode_test_phone/wer_4 +%WER 10.11 [ 36591 / 362027, 8702 ins, 6255 del, 21634 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it1/wer_4 +%WER 10.03 [ 36321 / 362027, 7490 ins, 6731 del, 22100 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it2/wer_5 +%WER 10.01 [ 36249 / 362027, 7507 ins, 6677 del, 22065 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it3/wer_5 + +exit 0 + +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_word* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#word task +%WER 51.04 [ 41414 / 81139, 474 ins, 2404 del, 38536 sub ] exp/mono/decode_test_word/wer_9 +%WER 36.38 [ 29522 / 81139, 516 ins, 1096 del, 27910 sub ] exp/tri1/decode_test_word/wer_10 +%WER 32.51 [ 26379 / 81139, 469 ins, 940 del, 24970 sub ] exp/tri2b/decode_test_word/wer_9 +%WER 31.65 [ 25684 / 81139, 340 ins, 1085 del, 24259 sub ] exp/tri3b/decode_test_word/wer_9 +%WER 34.07 [ 27643 / 81139, 443 ins, 1100 del, 26100 sub ] exp/tri3b/decode_test_word.si/wer_10 +%WER 29.64 [ 24052 / 81139, 341 ins, 929 del, 22782 sub ] exp/tri4b/decode_test_word/wer_11 +%WER 31.71 [ 25732 / 81139, 472 ins, 902 del, 24358 sub ] exp/tri4b/decode_test_word.si/wer_10 +%WER 23.57 [ 19123 / 81139, 419 ins, 585 del, 18119 sub ] exp/tri4b_dnn/decode_test_word/wer_7 +%WER 23.40 [ 18984 / 81139, 397 ins, 567 del, 18020 sub ] exp/tri4b_dnn_mpe/decode_test_word_it1/wer_7 +%WER 23.27 [ 18884 / 81139, 396 ins, 553 del, 17935 sub ] exp/tri4b_dnn_mpe/decode_test_word_it2/wer_7 +%WER 23.18 [ 18804 / 81139, 368 ins, 618 del, 17818 sub ] exp/tri4b_dnn_mpe/decode_test_word_it3/wer_8 + +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_phone_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#phone task +%WER 84.01 [ 304141 / 362027, 717 ins, 275948 del, 27476 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/white/wer_4 +%WER 14.11 [ 51074 / 362027, 10941 ins, 8175 del, 31958 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/car/wer_5 +%WER 71.63 [ 259329 / 362027, 6164 ins, 217508 del, 35657 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/cafe/wer_4 +%WER 40.04 [ 144946 / 
362027, 17764 ins, 35162 del, 92020 sub ] exp/tri4b_dnn_dae/decode_phone_0db/white/wer_6 +%WER 11.81 [ 42773 / 362027, 9598 ins, 7552 del, 25623 sub ] exp/tri4b_dnn_dae/decode_phone_0db/car/wer_5 +%WER 32.39 [ 117256 / 362027, 17793 ins, 27750 del, 71713 sub ] exp/tri4b_dnn_dae/decode_phone_0db/cafe/wer_6 +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_word_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#word task +%WER 98.56 [ 79973 / 81139, 15 ins, 64293 del, 15665 sub ] exp/tri4b_dnn_mpe/decode_word_0db/white/wer_4 +%WER 28.10 [ 22799 / 81139, 553 ins, 661 del, 21585 sub ] exp/tri4b_dnn_mpe/decode_word_0db/car/wer_8 +%WER 85.58 [ 69438 / 81139, 321 ins, 49066 del, 20051 sub ] exp/tri4b_dnn_mpe/decode_word_0db/cafe/wer_8 +%WER 65.23 [ 52923 / 81139, 827 ins, 4198 del, 47898 sub ] exp/tri4b_dnn_dae/decode_word_0db/white/wer_13 +%WER 25.12 [ 20379 / 81139, 444 ins, 676 del, 19259 sub ] exp/tri4b_dnn_dae/decode_word_0db/car/wer_9 +%WER 53.38 [ 43308 / 81139, 907 ins, 4164 del, 38237 sub ] exp/tri4b_dnn_dae/decode_word_0db/cafe/wer_12 + +exit 0 diff --git a/egs/thchs30/s5/cmd.sh b/egs/thchs30/s5/cmd.sh new file mode 100644 index 00000000000..1d8e768790f --- /dev/null +++ b/egs/thchs30/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/thchs30/s5/conf/decode_dnn.config b/egs/thchs30/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/thchs30/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/thchs30/s5/conf/fbank.conf b/egs/thchs30/s5/conf/fbank.conf new file mode 100644 index 00000000000..8e6e36c69cf --- /dev/null +++ b/egs/thchs30/s5/conf/fbank.conf @@ -0,0 +1,3 @@ +# No non-default options for now. +#--sample-frequency=8000 +--num-mel-bins=40 diff --git a/egs/thchs30/s5/conf/mfcc.conf b/egs/thchs30/s5/conf/mfcc.conf new file mode 100644 index 00000000000..47d6c48bfe5 --- /dev/null +++ b/egs/thchs30/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +#--sample-frequency=8000 diff --git a/egs/thchs30/s5/local/dae/add-noise-mod.py b/egs/thchs30/s5/local/dae/add-noise-mod.py new file mode 100755 index 00000000000..33e8a297aef --- /dev/null +++ b/egs/thchs30/s5/local/dae/add-noise-mod.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# Copyright 2016 Tsinghua University (Author: Chao Liu, Dong Wang). Apache 2.0. 
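# Example standalone invocation, mirroring the flags run_dae.sh passes below (the split
# scp name is just one of the $nj pieces produced by utils/split_scp.pl):
#   local/dae/add-noise-mod.py --noise-level 0 --sigma0 10 --seed 32 --verbose 0 \
#     --noise-prior "0.0,10.0,10.0,10.0" --noise-src data/dae/noise/noise.scp \
#     --wav-src exp/dae/gendata/train_split_1.scp --wavdir wav/dae/train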
+ + +from __future__ import print_function +import optparse +import random +import bisect +import re +import logging +import wave +import math +import struct +import sys +import os + +try: + import pyximport; pyximport.install() + from thchs30_util import * +except: + print("Cython possibly not installed, using standard python code. The process might be slow", file=sys.stderr) + + def energy(mat): + return float(sum([x * x for x in mat])) / len(mat) + + def mix(mat, noise, pos, scale): + ret = [] + l = len(noise) + for i in xrange(len(mat)): + x = mat[i] + d = int(x + scale * noise[pos]) + #if d > 32767 or d < -32768: + # logging.debug('overflow occurred!') + d = max(min(d, 32767), -32768) + ret.append(d) + pos += 1 + if pos == l: + pos = 0 + return (pos, ret) + + +def dirichlet(params): + samples = [random.gammavariate(x, 1) if x > 0 else 0. for x in params] + samples = [x / sum(samples) for x in samples] + for x in xrange(1, len(samples)): + samples[x] += samples[x - 1] + return bisect.bisect_left(samples, random.random()) + +def wave_mat(wav_filename): + f = wave.open(wav_filename, 'r') + n = f.getnframes() + ret = f.readframes(n) + f.close() + return list(struct.unpack('%dh' % n, ret)) + +def num_samples(mat): + return len(mat) + +def scp(scp_filename): + with open(scp_filename) as f: + for l in f: + yield tuple(l.strip().split()) + +def wave_header(sample_array, sample_rate): + byte_count = (len(sample_array)) * 2 # short + # write the header + hdr = struct.pack(' len(n): + noise_energies[type] = energy(n[p::]+n[0:len(n)-p:]) + else: + noise_energies[type] = energy(n[p:p+len(mat):]) + scale = math.sqrt(noise / noise_energies[type]) + logging.debug('noise scale: %f', scale) + pos, result = mix(mat, n, p, scale) + noises[type] = (pos, n) + if args.wavdir != 'NULL': + output_wave_file(args.wavdir, tag, result) + else: + output(tag, result) + +if __name__ == '__main__': + main() + + + diff --git a/egs/thchs30/s5/local/dae/run_dae.sh b/egs/thchs30/s5/local/dae/run_dae.sh new file mode 100755 index 00000000000..f6a6db3a01a --- /dev/null +++ b/egs/thchs30/s5/local/dae/run_dae.sh @@ -0,0 +1,149 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#Conducts experiments of dae-based denoisng + +stage=0 +nj=8 + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) +. utils/parse_options.sh || exit 1; + +thchs=$1 + +#generate noisy data. We focuse on the 0db condition. +#For training set, generate noisy data with SNR mean=0, variance=10, with three noise types mixed together. +#For dev, generate noisy data with SNR mean=0, variance=0, with three niose types mixed together +#For test, use the standard test data which were generated by SNR mean=0, variance=0. + +if [ $stage = 0 ]; then + #generat noise.scp + mkdir -p data/dae/noise && \ + awk '{print $1 " '$thchs'/resource/noise/"$2}' $thchs/resource/noise/noise.scp > data/dae/noise/noise.scp || exit 1 + + echo "DAE: generate training data..." + noise_scp=data/dae/noise/noise.scp + noise_prior="0.0,10.0,10.0,10.0" #define noise type to sample. 
[S_clean, S_white, S_car, S_cafe] + noise_level=0 #0db condition + sigma0=10 #some random in SNR + seed=32 + verbose=0 + wavdir=wav/dae/train + rm -rf data/dae/train && mkdir -p data/dae/train || exit 1 + cp data/fbank/train/{spk2utt,utt2spk,text} data/dae/train || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/train/wav.scp > data/dae/train/wav.scp || exit 1 + + mkdir -p exp/dae/gendata + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/train_split_${n}.scp" + done + utils/split_scp.pl data/fbank/train/wav.scp $split_scps || exit 1 + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_train.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/train_split_JOB.scp --wavdir $wavdir \ + || exit 1 + + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/train exp/dae/gendata fbank/dae/train || exit 1 + steps/compute_cmvn_stats.sh data/dae/train exp/dae/cmvn \ + fbank/dae/train || exit 1 + + #genreate dev data. Just the 0db condition is produced. Multiple noise types mixed together. + echo "DAE: generating dev data..." + wavdir=wav/dae/dev/0db + sigma0=0 #no random in SNR + rm -rf data/dae/dev/0db && mkdir -p data/dae/dev/0db && \ + cp -L data/fbank/dev/{spk2utt,utt2spk,text} data/dae/dev/0db || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/dev/wav.scp > data/dae/dev/0db/wav.scp || exit 1 + + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/dev_split_${n}.scp" + done + utils/split_scp.pl data/fbank/dev/wav.scp $split_scps || exit 1 + + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_dev.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/dev_split_JOB.scp --wavdir $wavdir \ + || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/dev/0db exp/dae/gendata fbank/dae/dev/0db || exit 1 + steps/compute_cmvn_stats.sh data/dae/dev/0db exp/dae/cmvn \ + fbank/dae/dev/0db || exit 1 + + #generate test data. Assume it has been downloaded in $thchs/test-noise + echo "DAE: generating test data..." 
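# The $train_cmd JOB=1:$nj lines above fan one add-noise process out per split scp; the
# same substitution can be tried in isolation (the log path here is just an example):
utils/run.pl JOB=1:4 exp/dae/gendata/demo.JOB.log echo "this is job JOB"
# run.pl expands JOB in both the log name and the command, giving demo.1.log ... demo.4.log.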
+ #generate fbank + for x in car white cafe; do + echo "producing fbanks for $x" + mkdir -p data/dae/test/0db/$x && \ + cp -L data/fbank/test/{spk2utt,utt2spk,text} data/dae/test/0db/$x && \ + awk '{print $1 " '$thchs'/test-noise/0db/'$x'/"$1".wav"}' data/fbank/test/wav.scp > data/dae/test/0db/$x/wav.scp || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/test/0db/$x exp/dae/gendata fbank/dae/test/0db/$x || exit 1 + echo "generating cmvn for test data $x" + steps/compute_cmvn_stats.sh data/dae/test/0db/$x exp/dae/cmvn \ + fbank/dae/test/0db/$x || exit 1 + cp -R data/dae/test/0db/$x data/dae/test/0db/${x}_phone && cp data/test/phone.txt data/dae/test/0db/${x}_phone/text || exit 1 + done +fi + +#DAE training +if [ $stage -le 1 ]; then + #train dnn dae using data with mixed noise + #produce merged feats.scp as --labels for both training and cv + dir=exp/tri4b_dnn_dae && mkdir -p exp/tri4b_dnn_dae || exit 1 + cat data/fbank/train/feats.scp data/fbank/dev/feats.scp | sort -u > $dir/tgt_feats.scp + cat data/fbank/train/cmvn.scp data/fbank/dev/cmvn.scp | sort -u > $dir/tgt_cmvn.scp + + num_fea=$(feat-to-dim scp:$dir/tgt_feats.scp -) + echo "num_fea = $num_fea" + + $cuda_cmd exp/tri4b_dnn_dae/log/train_nnet.log \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1200 \ + --cmvn-opts "--norm-vars=false" --splice 10 \ + --learn-rate 0.0001 \ + --train_tool_opts "--objective-function=mse" \ + --copy_feats false \ + --labels "ark:copy-feats scp:$dir/tgt_feats.scp ark:- | apply-cmvn --norm-vars=false scp:$dir/tgt_cmvn.scp ark:- ark:- | feat-to-post ark:- ark:-|" \ + --num-tgt $num_fea \ + --proto-opts '--no-softmax ' \ + data/dae/train data/dae/dev/0db data/lang \ + data/fbank/train data/fbank/dev \ + exp/tri4b_dnn_dae || exit 1; + nnet-concat exp/tri4b_dnn_dae/final.feature_transform exp/tri4b_dnn_dae/final.nnet \ + exp/tri4b_dnn_mpe/final.feature_transform exp/tri4b_dnn_dae/dae.nnet || exit 1 + +fi + +#decoding +if [ $stage -le 2 ]; then + for x in car white cafe; do + ( + #decode word + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_mpe/decode_word_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_dae/decode_word_0db/$x || exit 1; + + #decode phone + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_mpe/decode_phone_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_dae/decode_phone_0db/$x || exit 1; + ) & + done +fi + diff --git a/egs/thchs30/s5/local/dae/thchs30_util.pyx b/egs/thchs30/s5/local/dae/thchs30_util.pyx new file mode 100755 index 00000000000..281ff166032 --- /dev/null +++ b/egs/thchs30/s5/local/dae/thchs30_util.pyx @@ -0,0 +1,27 @@ +# Copyright 2016 Tsinghua University (Author: Chao Liu). Apache 2.0. 
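# The --labels option in run_dae.sh above feeds clean fbank features, converted by
# feat-to-post, to the network as regression targets (hence --objective-function=mse).
# The target stream can be inspected on its own once stage 1 has written tgt_feats.scp;
# a sketch, run from egs/thchs30/s5 (text form, first utterance only):
copy-feats scp:exp/tri4b_dnn_dae/tgt_feats.scp ark:- | \
  apply-cmvn --norm-vars=false scp:exp/tri4b_dnn_dae/tgt_cmvn.scp ark:- ark:- | \
  feat-to-post ark:- ark,t:- | head -n 1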
+ +def energy(list mat): + cdef float e + cdef int i, j, l + l = len(mat) + for i in range(l): + j = mat[i] + e += j * j + e /= l + return e + +def mix(list mat, list noise, int pos, double scale): + cdef len_noise, len_mat, i, x, y + ret = [] + len_noise = len(noise) + len_mat = len(mat) + for i in range(len_mat): + x = mat[i] + y = int(x + scale * noise[pos]) + if y > 32767: + y = 32767 + elif y < -32768: + y = -32768 + ret.append(y) + pos = (pos + 1) % len_noise + return pos, ret diff --git a/egs/thchs30/s5/local/download_and_untar.sh b/egs/thchs30/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..655e674dc9b --- /dev/null +++ b/egs/thchs30/s5/local/download_and_untar.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# Copyright 2016 Tsinghua University (author: Dong Wang) +# Apache 2.0 + +# Adapted from librispeech recipe local/download_and_untar.sh + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_thchs30, test-noise, resource" +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_thchs30 test-noise resource" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + + +sizes="6453425169 1971460210 24813708" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + pwd + echo " wget --no-check-certificate $full_url" + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi diff --git a/egs/thchs30/s5/local/nnet/run_dnn.sh b/egs/thchs30/s5/local/nnet/run_dnn.sh new file mode 100755 index 00000000000..d40f48e3609 --- /dev/null +++ b/egs/thchs30/s5/local/nnet/run_dnn.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#run from ../.. +#DNN training, both xent and MPE + + +. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +stage=0 +nj=8 + +. utils/parse_options.sh || exit 1; + +gmmdir=$1 +alidir=$2 +alidir_cv=$3 + +#generate fbanks +if [ $stage -le 0 ]; then + echo "DNN training: stage 0: feature generation" + rm -rf data/fbank && mkdir -p data/fbank && cp -R data/{train,dev,test,test_phone} data/fbank || exit 1; + for x in train dev test; do + echo "producing fbank for $x" + #fbank generation + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" data/fbank/$x exp/make_fbank/$x fbank/$x || exit 1 + #ompute cmvn + steps/compute_cmvn_stats.sh data/fbank/$x exp/fbank_cmvn/$x fbank/$x || exit 1 + done + + echo "producing test_fbank_phone" + cp data/fbank/test/feats.scp data/fbank/test_phone && cp data/fbank/test/cmvn.scp data/fbank/test_phone || exit 1; + +fi + + +#xEnt training +if [ $stage -le 1 ]; then + outdir=exp/tri4b_dnn + #NN training + (tail --pid=$$ -F $outdir/log/train_nnet.log 2>/dev/null)& # forward log + $cuda_cmd $outdir/log/train_nnet.log \ + steps/nnet/train.sh --copy_feats false --cmvn-opts "--norm-means=true --norm-vars=false" --hid-layers 4 --hid-dim 1024 \ + --learn-rate 0.008 data/fbank/train data/fbank/dev data/lang $alidir $alidir_cv $outdir || exit 1; + #Decode (reuse HCLG graph in gmmdir) + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone || exit 1; + )& + +fi + +#MPE training + +srcdir=exp/tri4b_dnn +acwt=0.1 + +if [ $stage -le 2 ]; then + # generate lattices and alignments + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + data/fbank/train data/lang $srcdir ${srcdir}_ali || exit 1; + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + data/fbank/train data/lang $srcdir ${srcdir}_denlats || exit 1; +fi + +if [ $stage -le 3 ]; then + outdir=exp/tri4b_dnn_mpe + #Re-train the DNN by 3 iteration of MPE + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 3 --acwt $acwt --do-smbr false \ + data/fbank/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $outdir || exit 1 + #Decode (reuse HCLG graph) + for ITER in 3 2 1; do + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word_it${ITER} || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone_it${ITER} || exit 1; + )& + done +fi + diff --git a/egs/thchs30/s5/local/score.sh b/egs/thchs30/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/thchs30/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/thchs30/s5/local/thchs-30_data_prep.sh b/egs/thchs30/s5/local/thchs-30_data_prep.sh new file mode 100755 index 00000000000..7a85274ce83 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_data_prep.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). 
Apache 2.0. + +#This script pepares the data directory for thchs30 recipe. +#It reads the corpus and get wav.scp and transcriptions. + +dir=$1 +corpus_dir=$2 + + +cd $dir + +echo "creating data/{train,dev,test}" +mkdir -p data/{train,dev,test} + +#create wav.scp, utt2spk.scp, spk2utt.scp, text +( +for x in train dev test; do + echo "cleaning data/$x" + cd $dir/data/$x + rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text + echo "preparing scps and text in data/$x" + for nn in `find $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do + echo $nn $corpus_dir/$x/$nn.wav >> wav.scp + echo $nn $nn >> utt2spk + echo $nn $nn >> spk2utt + echo $nn `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt + echo $nn `sed -n 3p $corpus_dir/data/$nn.wav.trn` >> phone.txt + done + cp word.txt text +done +) || exit 1 + +echo "creating test_phone for phone decoding" +( + rm -rf data/test_phone && cp -R data/test data/test_phone || exit 1 + cd data/test_phone && rm text && cp phone.txt text || exit 1 +) + diff --git a/egs/thchs30/s5/local/thchs-30_decode.sh b/egs/thchs30/s5/local/thchs-30_decode.sh new file mode 100755 index 00000000000..f9661f61f21 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_decode.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#decoding wrapper for thchs30 recipe +#run from ../ + +nj=8 +mono=false + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +. utils/parse_options.sh || exit 1; +decoder=$1 +srcdir=$2 +datadir=$3 + + +if [ $mono = true ];then + echo "using monophone to generate graph" + opt="--mono" +fi + +#decode word +utils/mkgraph.sh $opt data/graph/lang $srcdir $srcdir/graph_word || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_word $datadir/test $srcdir/decode_test_word || exit 1 + +#decode phone +utils/mkgraph.sh $opt data/graph_phone/lang $srcdir $srcdir/graph_phone || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_phone $datadir/test_phone $srcdir/decode_test_phone || exit 1 + + diff --git a/egs/thchs30/s5/local/wer_output_filter b/egs/thchs30/s5/local/wer_output_filter new file mode 100755 index 00000000000..1ccb651a258 --- /dev/null +++ b/egs/thchs30/s5/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python +#Copyright 2016 Tsinghua University (Author: Dong Wang). Apache 2.0. + +#This script accepts a Chinese stream and inserts blanks between Chinese characters +#Used to prepare character-based transcriptions and compute CER. + +from __future__ import print_function +import sys + +for l in sys.stdin: + l=l.strip() + ll=l.split() + lk=ll[0] + for v in ll[1:]: + v = v.decode('utf-8') + for i in v: + lk= lk + ' ' + i + + print (lk.encode('utf-8')) diff --git a/egs/thchs30/s5/path.sh b/egs/thchs30/s5/path.sh new file mode 100755 index 00000000000..bc199673fc5 --- /dev/null +++ b/egs/thchs30/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh + +export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$PWD:$PATH + +export LC_ALL=C + diff --git a/egs/thchs30/s5/run.sh b/egs/thchs30/s5/run.sh new file mode 100755 index 00000000000..24645f59e83 --- /dev/null +++ b/egs/thchs30/s5/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh + +H=`pwd` #exp home +n=8 #parallel jobs + +#corpus and trans directory +thchs=/nfs/public/materials/data/thchs30-openslr + +#you can obtain the database by uncommenting the following lines +#[ -d $thchs ] || mkdir -p $thchs || exit 1 +#echo "downloading THCHS30 at $thchs ..." +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 data_thchs30 || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 resource || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 test-noise || exit 1 + +#data preparation +#generate text, wav.scp, utt2spk, spk2utt +local/thchs-30_data_prep.sh $H $thchs/data_thchs30 || exit 1; + +#produce MFCC features +rm -rf data/mfcc && mkdir -p data/mfcc && cp -R data/{train,dev,test,test_phone} data/mfcc || exit 1; +for x in train dev test; do + #make mfcc + steps/make_mfcc.sh --nj $n --cmd "$train_cmd" data/mfcc/$x exp/make_mfcc/$x mfcc/$x || exit 1; + #compute cmvn + steps/compute_cmvn_stats.sh data/mfcc/$x exp/mfcc_cmvn/$x mfcc/$x || exit 1; +done +#copy feats and cmvn to test_phone, avoid duplicated mfcc & cmvn +cp data/mfcc/test/feats.scp data/mfcc/test_phone && cp data/mfcc/test/cmvn.scp data/mfcc/test_phone || exit 1; + + +#prepare language stuff +#build a large lexicon that involves words in both the training and decoding. +( + echo "make word graph ..." + cd $H; mkdir -p data/{dict,lang,graph} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict && \ + cat $thchs/resource/dict/lexicon.txt $thchs/data_thchs30/lm_word/lexicon.txt | \ + grep -v '<s>' | grep -v '</s>' | sort -u > data/dict/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict "<SPOKEN_NOISE>" data/local/lang data/lang || exit 1; + gzip -c $thchs/data_thchs30/lm_word/word.3gram.lm > data/graph/word.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang data/graph/word.3gram.lm.gz $thchs/data_thchs30/lm_word/lexicon.txt data/graph/lang || exit 1; +) + +#make_phone_graph +( + echo "make phone graph ..."
+ cd $H; mkdir -p data/{dict_phone,graph_phone,lang_phone} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict_phone && \ + cat $thchs/data_thchs30/lm_phone/lexicon.txt | grep -v '<eps>' | sort -u > data/dict_phone/lexicon.txt && \ + echo "<SIL> sil " >> data/dict_phone/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict_phone "<SIL>" data/local/lang_phone data/lang_phone || exit 1; + gzip -c $thchs/data_thchs30/lm_phone/phone.3gram.lm > data/graph_phone/phone.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang_phone data/graph_phone/phone.3gram.lm.gz $thchs/data_thchs30/lm_phone/lexicon.txt \ + data/graph_phone/lang || exit 1; +) + +#monophone +steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono || exit 1; +#test monophone model +local/thchs-30_decode.sh --mono true --nj $n "steps/decode.sh" exp/mono data/mfcc & + +#monophone_ali +steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1; + +#triphone +steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1; +#test tri1 model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc & + +#triphone_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +#lda_mllt +steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b || exit 1; +#test tri2b model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc & + + +#lda_mllt_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + +#sat +steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b || exit 1; +#test tri3b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc & + +#sat_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali || exit 1; + +#quick +steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b || exit 1; +#test tri4b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc & + +#quick_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri4b exp/tri4b_ali || exit 1; + +#quick_ali_cv +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/dev data/lang exp/tri4b exp/tri4b_ali_cv || exit 1; + +#train dnn model +local/nnet/run_dnn.sh --stage 0 --nj $n exp/tri4b exp/tri4b_ali exp/tri4b_ali_cv || exit 1; + +#train dae model +#python2.6 or above is required for noisy data generation. +#To speed up the process, pyximport for python is recommended.
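Editor's aside (not part of the patch): the local/thchs-30_decode.sh calls above are all launched in the background with '&', so before the final DAE stage below it can be handy to wait for them and summarize the word error rates. A minimal sketch, assuming utils/best_wer.sh from the wsj utils directory that this recipe symlinks:

wait   # let the background decode jobs finish
for d in exp/*/decode_test_word*; do
  [ -d $d ] && grep WER $d/wer_* 2>/dev/null | utils/best_wer.sh
done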
+local/dae/run_dae.sh --stage 0 $thchs || exit 1; diff --git a/egs/thchs30/s5/steps b/egs/thchs30/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/thchs30/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/thchs30/s5/utils b/egs/thchs30/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/thchs30/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/tidigits/s5/cmd.sh b/egs/tidigits/s5/cmd.sh index c8f0d9d67a7..71dd849a93b 100644 --- a/egs/tidigits/s5/cmd.sh +++ b/egs/tidigits/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -#export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/tidigits/s5/local/tidigits_prepare_lang.sh b/egs/tidigits/s5/local/tidigits_prepare_lang.sh index ff316514fc9..0bc08ab40a0 100755 --- a/egs/tidigits/s5/local/tidigits_prepare_lang.sh +++ b/egs/tidigits/s5/local/tidigits_prepare_lang.sh @@ -88,10 +88,11 @@ utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \ cp $lang/L.fst $lang/L_disambig.fst -silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'` -nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo +num_sil_states=5 +num_nonsil_states=3 +silphonelist=`cat $lang/phones/silence.csl` +nonsilphonelist=`cat $lang/phones/nonsilence.csl` +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo # Now we prepare a simple grammar G.fst that's a kind of loop of # digits (no silence in this, since that's handled in L.fst) diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index fd91a53ff73..5abbfd4495a 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -1,36 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -#export cuda_cmd=run.pl +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still sometimes used in nnet1 +# example scripts. +export cuda_cmd="queue.pl --gpu 1" - -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then - export train_cmd="queue.pl -l arch=*64*" - export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" - export cuda_cmd="queue.pl -l gpu=1" -elif [[ $(hostname -f) == *.fit.vutbr.cz ]]; then +# the rest of this file is present for historical reasons. +# for cluster-specific configuration it's better to rely on conf/queue.conf. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options - queue="all.q@@blade,all.q@@speech,all.q@dellgpu*,all.q@supergpu*" - export train_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,matylda5=0.5" - export decode_cmd="queue.pl -q $queue -l ram_free=3000M,mem_free=3000M,matylda5=0.1" - export mkgraph_cmd="queue.pl -q $queue -l ram_free=4G,mem_free=4G,matylda5=3" - export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu1,long.q@pcgpu*,long.q@supergpu1 -l gpu=1" -else - echo "$0: you need to define options for your cluster." - exit 1; + queue="all.q@@blade,all.q@@speech" + gpu_queue="long.q@@gpu" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/voxforge/s5/cmd.sh b/egs/voxforge/s5/cmd.sh index 2d454050669..71dd849a93b 100644 --- a/egs/voxforge/s5/cmd.sh +++ b/egs/voxforge/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/vystadial_cz/s5/cmd.sh b/egs/vystadial_cz/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_cz/s5/cmd.sh +++ b/egs/vystadial_cz/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_en/s5/cmd.sh b/egs/vystadial_en/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_en/s5/cmd.sh +++ b/egs/vystadial_en/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index 00aa0c145a3..537c46ba4f2 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -1,30 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - -#b) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +export mkgraph_cmd="queue.pl --mem 4G" +# the use of cuda_cmd is deprecated. +export cuda_cmd="queue.pl --gpu 1" +# the rest of this file is present for historical reasons. +# It's better to use conf/queue.conf for cluster-specific configuration. 
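Editor's aside: the cmd.sh files in this patch repeatedly point the reader at conf/queue.conf without showing one. A rough sketch of the kind of file meant here, modeled on the default_config string inside utils/queue.pl; the specific qsub flags and the g.q GPU queue are assumptions and will differ from cluster to cluster:

# write a minimal GridEngine-style queue config; each "option" line maps a
# generic option like --mem or --gpu onto scheduler-specific qsub flags
cat > conf/queue.conf <<'EOF'
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option mem=* -l mem_free=$0,ram_free=$0
option mem=0
option num_threads=* -pe smp $0
option num_threads=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
EOF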
#c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi - diff --git a/egs/wsj/s5/local/nnet3/run_lstm.sh b/egs/wsj/s5/local/nnet3/run_lstm.sh index cd64b654651..2d7ab51d900 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm.sh @@ -46,7 +46,7 @@ frames_per_chunk= echo "$0 $@" # Print the command line for logging -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/wsj/s5/local/run_kl_hmm.sh b/egs/wsj/s5/local/run_kl_hmm.sh index 9e7679a7675..efe95052c1d 100644 --- a/egs/wsj/s5/local/run_kl_hmm.sh +++ b/egs/wsj/s5/local/run_kl_hmm.sh @@ -5,6 +5,8 @@ . cmd.sh +big_memory_cmd="$decode_cmd --mem 8G" + states=20000 dir=exp/tri4b_pretrain-dbn_dnn/ diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index 911640f5495..3f620083e11 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -228,7 +228,8 @@ sub SplitLongSegment { $aligned_ctm->[$seg_end_index]->[2] - $aligned_ctm->[$seg_start_index]->[1]; my $current_seg_index = $seg_start_index; - while ($current_seg_length > 1.5 * $max_seg_length) { + my $aligned_ctm_size = keys($aligned_ctm); + while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, $max_seg_length); my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length, @@ -322,7 +323,7 @@ sub ProcessWav { } # Save the aligned CTM if needed - if(tell($ACT) != -1){ + if(defined($ACT)){ for (my $i=0; $i<=$#aligned_ctm; $i++) { print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][1] "; print $ACT "$aligned_ctm[$i][2] $aligned_ctm[$i][3]\n"; @@ -458,4 +459,4 @@ sub InsertSilence { close(AI); close($SO); close($TO); -close($ACT); +close($ACT) if defined($ACT); diff --git a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh index c768d89b44e..cdf1ff3e5df 100755 --- a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh +++ b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh @@ -4,11 +4,12 @@ # this script gets some stats that will help you debug the lexicon. -# Begin configuration section. +# Begin configuration section. stage=1 remove_stress=false nj=10 # number of jobs for various decoding-type things that we run. cmd=run.pl +alidir= # End configuration section echo "$0 $@" # Print the command line for logging @@ -26,6 +27,8 @@ if [ $# != 5 ]; then echo " --remove-stress # if true, remove stress before printing analysis" echo " # note: if you change this, you only have to rerun" echo " # from stage 10." + echo " --alidir # if supplied, training-data alignments and transforms" + echo " # are obtained from here instead of being generated." exit 1; fi @@ -41,38 +44,46 @@ for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1; done -if [ $stage -le 1 ]; then - steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src ${src}_ali_$(basename $data) +if [ -z $alidir ]; then + alidir=${src}_ali_$(basename $data) + if [ $stage -le 1 ]; then + steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir + fi fi +phone_lang=data/$(basename $lang)_phone_bg + if [ $stage -le 2 ]; then - utils/make_phone_bigram_lang.sh $lang ${src}_ali_$(basename $data) data/$(basename $lang)_phone_bg + utils/make_phone_bigram_lang.sh $lang $alidir $phone_lang fi if [ $stage -le 3 ]; then - utils/mkgraph.sh data/$(basename $lang)_phone_bg $src $src/graph_phone_bg + utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg fi if [ $stage -le 4 ]; then - steps/decode_si.sh --cmd "$cmd" --nj $nj --transform-dir ${src}_ali_$(basename $data) \ - --acwt 0.25 --beam 25.0 --lattice-beam 5.0 --max-active 2500 \ + steps/decode_si.sh --skip-scoring true \ + --cmd "$cmd" --nj $nj --transform-dir $alidir \ + --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \ $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 5 ]; then - steps/get_train_ctm.sh $data $lang ${src}_ali_$(basename $data) + steps/get_train_ctm.sh --print-silence true --use-segments false \ + --cmd "$cmd" $data $lang $alidir fi if [ $stage -le 6 ]; then - steps/get_ctm.sh --min-lmwt 3 --max-lmwt 8 \ - $data data/$(basename $lang)_phone_bg $src/decode_$(basename $data)_phone_bg + steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \ + $data $phone_lang $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 7 ]; then mkdir -p $dir # lmwt=4 corresponds to the scale we decoded at. cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm - cp ${src}_ali_$(basename $data)/ctm $dir/word.ctm + + cp $alidir/ctm $dir/word.ctm fi if [ $stage -le 8 ]; then @@ -82,7 +93,7 @@ if [ $stage -le 8 ]; then # we'll convert it into two entries like this, with the start and end separately: # sw02054-A 0021332 START and # sw02054-A 0021356 END and -# +# # and suppose phone.ctm has lines like # sw02054 A 213.09 0.24 sil # sw02054 A 213.33 0.13 ae_B @@ -95,18 +106,17 @@ if [ $stage -le 8 ]; then # then after sorting and merge-sorting the two ctm files we can easily # work out for each word, what the phones were during that time. 
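Editor's aside: the merge-sort trick described in the comments above is easiest to see on a toy example. The sketch below uses a hypothetical utterance name and made-up times, with awk commands that mirror the ones the script itself runs:

printf 'sw02054 A 213.30 0.30 and\n' > word.ctm
printf 'sw02054 A 213.30 0.15 ae_B\nsw02054 A 213.45 0.15 n_E\n' > phone.ctm
# words become START/END events; each phone becomes a PHONE event at its midpoint
awk '{printf("%s-%s %09d START %s\n",$1,$2,100*$3,$5); printf("%s-%s %09d END %s\n",$1,$2,100*($3+$4),$5);}' word.ctm > word_processed.ctm
awk '{printf("%s-%s %09d PHONE %s\n",$1,$2,100*($3+0.5*$4),$5);}' phone.ctm > phone_processed.ctm
LC_ALL=C sort word_processed.ctm phone_processed.ctm
# every PHONE line now sorts between the START and END lines of the word it belongs to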
- grep -v '' data/$(basename $lang)_phone_bg/phones.txt | awk '{print $1, $1}' | \ + grep -v '' $phone_lang/phones.txt | awk '{print $1, $1}' | \ sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt - silphone=$(cat data/$(basename $lang)_phone_bg/phones/optional_silence.txt) - cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt | grep -v "$silphone\$" > $dir/phone_cleaned.ctm + cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_text.ctm > $dir/phone_mapped.ctm export LC_ALL=C - + cat $dir/word.ctm | awk '{printf("%s-%s %09d START %s\n", $1, $2, 100*$3, $5); printf("%s-%s %09d END %s\n", $1, $2, 100*($3+$4), $5);}' | \ sort >$dir/word_processed.ctm - cat $dir/phone_cleaned.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ + cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ sort >$dir/phone_processed.ctm # merge-sort both ctm's @@ -129,12 +139,16 @@ if [ $stage -le 10 ]; then else cp $srcdict $dir/lexicon.txt fi + silphone=$(cat $phone_lang/phones/optional_silence.txt) + echo " $silphone" >> $dir/lexicon.txt awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \ <$dir/prons.txt >$dir/counts.txt + + cat $dir/prons.txt | \ - if $remove_stress; then + if $remove_stress; then perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' else cat @@ -143,9 +157,9 @@ if [ $stage -le 10 ]; then open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]"; # create a hash of all reference pronuncations, and for each word, record # a list of the prons, separated by " | ". - while () { - @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; - $w = shift @A; + while () { + @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; + $w = shift @A; if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); } else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); } } diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh index 97fb62a9c4f..80a71b0edc5 100755 --- a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh @@ -5,9 +5,9 @@ # Computes training alignments using a model with delta or # LDA+MLLT features. This version, rather than just using the # text to align, computes mini-language models (unigram) from the text -# and a few common words in the LM, and allows +# and a few common words in the LM. -# Begin configuration section. +# Begin configuration section. nj=4 cmd=run.pl use_graphs=false @@ -82,7 +82,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $srcdir/full.mat $dir + cp $srcdir/final.mat $srcdir/full.mat $dir ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac @@ -155,7 +155,7 @@ if [ $stage -le 2 ]; then # # with the fields separated by tabs, e.g. 
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED - + paste $dir/edits.txt \ <(awk '{print $2}' $dir/length.txt) \ <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ @@ -171,9 +171,9 @@ fi if [ $stage -le 3 ]; then ### - # These stats migh help people figure out what is wrong with the data + # These stats might help people figure out what is wrong with the data # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt - # b)evaluation of per-speaker performance to possibly find speakers with + # b)evaluation of per-speaker performance to possibly find speakers with # distinctive accents/speech disorders and similar # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure # out if there is systematic issue with lexicon, pronunciation or phonetic confusability diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh new file mode 100755 index 00000000000..42c768f9a2d --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh @@ -0,0 +1,162 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2016 Ilya Platonov +# Apache 2.0 +# +# Tweaked version of find_bad_utts.sh to work with nnet2 baseline models. +# +# Begin configuration section. +nj=32 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=15.0 +lattice_beam=8.0 +max_active=750 +transform_dir= # directory to find fMLLR transforms in. +top_n_words=100 # Number of common words that we compile into each graph (most frequent + # in $lang/text. +stage=-1 +cleanup=true +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: $0 " + echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_debug" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \ + $lang/L_disambig.fst $lang/phones/disambig.int; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. + +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; + + +if [ $stage -le 0 ]; then + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \ + awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \ + sort -rn > $dir/word_counts.int || exit 1; + num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1; + # print top-n words with their unigram probabilities. 
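Editor's aside: a tiny self-contained check of the count-to-probability step performed just below, using toy counts and hypothetical word-ids (not part of the patch):

printf '10 5\n6 7\n4 9\n' > word_counts.int          # "<count> <word-id>", sorted by count
tot=$(awk '{x+=$1} END{print x}' word_counts.int)    # total number of word tokens, here 20
head -n 2 word_counts.int | awk -v tot=$tot '{print $1/tot, $2;}'
# prints "0.5 5" and "0.3 7": unigram probability followed by word-id, as in top_words.int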
+ + head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int + utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt +fi + +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"; + +if [ $stage -le 1 ]; then + echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir" + + rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null + + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \ + steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \ + compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \ + $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + nnet-latgen-faster --acoustic-scale=$acoustic_scale --beam=$beam \ + --max-active=$max_active --lattice-beam=$lattice_beam \ + --word-symbol-table=$lang/words.txt \ + $dir/final.mdl ark:- "$feats" ark:- \| \ + lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \ + ark,t:- ark,t:$dir/edits.JOB.txt \| \ + utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1; +fi + + +if [ $stage -le 2 ]; then + if [ -f $dir/edits.1.txt ]; then + # the awk commands below are to ensure that partially-written files don't confuse us. + for x in $(seq $nj); do cat $dir/edits.$x.txt; done | awk '{if(NF==2){print;}}' > $dir/edits.txt + for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/aligned_ref.txt + else + echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present." + fi + + # in case any utterances failed to align, get filtered copy of $data/text + utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text + cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt + + n1=$(wc -l < $dir/edits.txt) + n2=$(wc -l < $dir/aligned_ref.txt) + n3=$(wc -l < $dir/text) + n4=$(wc -l < $dir/length.txt) + if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then + echo "$0: mismatch in lengths of files:" + wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt + exit 1; + fi + + # note: the format of all_info.txt is: + # + # with the fields separated by tabs, e.g. 
+ # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED + + paste $dir/edits.txt \ + <(awk '{print $2}' $dir/length.txt) \ + <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ + <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt + + sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt + + if $cleanup; then + rm $dir/edits.*.txt $dir/aligned_ref.*.txt + fi + +fi + +if [ $stage -le 3 ]; then + ### + # These stats migh help people figure out what is wrong with the data + # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt + # b)evaluation of per-speaker performance to possibly find speakers with + # distinctive accents/speech disorders and similar + # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure + # out if there is systematic issue with lexicon, pronunciation or phonetic confusability + + mkdir -p $dir/analysis + align-text --special-symbol="***" ark:$dir/text ark:$dir/aligned_ref.txt ark,t:- | \ + utils/scoring/wer_per_utt_details.pl --special-symbol "***" > $dir/analysis/per_utt_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_ops_details.pl --special-symbol "***" | \ + sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt + +fi + diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh index 33be80d85b2..c1a22e274b8 100755 --- a/egs/wsj/s5/steps/conf/apply_calibration.sh +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -76,7 +76,7 @@ fi # Create the forwarding data for logistic regression, if [ $stage -le 2 ]; then steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ - $dir/ctm_int $word_feats $latdepth $word_categories + --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories fi # Apply calibration model to dev, diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py index 003d77c5e8a..23db9633a1c 100755 --- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -13,8 +13,8 @@ The logisitc-regression input features are: - posteriors from 'ctm' transformed by logit, - logarithm of word-length in letters, -- logarithm of average lattice-depth at position of the word, - 10base logarithm of unigram probability of a word from language model, +- logarithm of average lattice-depth at position of the word (optional), The logistic-regresion targets are: - 1 for correct word, @@ -33,12 +33,13 @@ parser = OptionParser(usage=usage, description=desc) parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') +parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). 
[default %default]", default='') (o, args) = parser.parse_args() -if len(args) != 4: +if len(args) != 3: parser.print_help() sys.exit(1) -ctm_file, word_feats_file, depths_file, word_categories_file = args +ctm_file, word_feats_file, word_categories_file = args assert(o.conf_feats != '') @@ -76,10 +77,12 @@ # Load the per-frame lattice-depth, # - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file, -depths = dict() -for l in open(depths_file): - utt,d = l.split(' ',1) - depths[utt] = map(int,d.split()) +# - if the 'ctm' and 'ark' keys don't match, we leave this feature out, +if o.lattice_depth: + depths = dict() + for l in open(o.lattice_depth): + utt,d = l.split(' ',1) + depths[utt] = map(int,d.split()) # Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt', wrd_to_cat = [ l.split() for l in open(word_categories_file) ] @@ -98,15 +101,19 @@ logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper) # - log of word-length, log_word_length = math.log(word_length[wrd_id]) # i.e. number of phones in a word, - # - log of average-depth of lattice at the word position, - depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] - log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) # - categorical distribution of words (with frequency higher than min-count), wrd_1_of_k = [0]*wrd_cat_num; wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; # Compose the input feature vector, - feats = [ logit, log_word_length, log_avg_depth, other_feats[wrd_id] ] + wrd_1_of_k + feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k + + # Optionally add average-depth of lattice at the word position, + if o.lattice_depth != '': + depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) + feats += [ log_avg_depth ] + # Store the input features, f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh index 64ca70022c8..c2aca05056e 100755 --- a/egs/wsj/s5/steps/conf/train_calibration.sh +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -104,7 +104,7 @@ fi if [ $stage -le 3 ]; then steps/conf/prepare_calibration_data.py \ --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ - $dir/ctm_aligned_int $word_feats $latdepth $dir/word_categories + --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories fi # Train the logistic regression, diff --git a/egs/wsj/s5/steps/decode.sh b/egs/wsj/s5/steps/decode.sh index b0e2fed2017..f2bc1d367fd 100755 --- a/egs/wsj/s5/steps/decode.sh +++ b/egs/wsj/s5/steps/decode.sh @@ -3,8 +3,8 @@ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 -# Begin configuration section. -transform_dir= # this option won't normally be used, but it can be used if you want to +# Begin configuration section. +transform_dir= # this option won't normally be used, but it can be used if you want to # supply existing fMLLR transforms when decoding. iter= model= # You can specify the model to use (e.g. if you want to use the .alimdl) @@ -64,16 +64,16 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs if [ -z "$model" ]; then # if --model was not specified on the command line... 
- if [ -z $iter ]; then model=$srcdir/final.mdl; + if [ -z $iter ]; then model=$srcdir/final.mdl; else model=$srcdir/$iter.mdl; fi fi if [ $(basename $model) != final.alimdl ] ; then # Do not use the $srcpath -- look at the path where the model is - if [ -f $(dirname $model)/final.alimdl ] ; then - echo -e '\n\n' - echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' - echo $0 'WARNING: This is OK if you know what you are doing...' + if [ -f $(dirname $model)/final.alimdl ] && [ -z "$transform_dir" ]; then + echo -e '\n\n' + echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' + echo $0 'WARNING: This is OK if you know what you are doing...' echo -e '\n\n' fi fi @@ -90,7 +90,7 @@ cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; @@ -129,7 +129,7 @@ fi if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } fi diff --git a/egs/wsj/s5/steps/get_ctm.sh b/egs/wsj/s5/steps/get_ctm.sh index 3d0ea576a57..2f2f6794e3d 100755 --- a/egs/wsj/s5/steps/get_ctm.sh +++ b/egs/wsj/s5/steps/get_ctm.sh @@ -8,6 +8,7 @@ # begin configuration section. cmd=run.pl stage=0 +frame_shift=0.01 min_lmwt=5 max_lmwt=20 use_segments=true # if we have a segments file, use it to convert @@ -28,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your lattices have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri4a/decode/" echo "See also: steps/get_train_ctm.sh" @@ -55,7 +58,7 @@ if [ $stage -le 0 ]; then [ ! 
-f $f ] && echo "$0: expecting file $f to exist" && exit 1; filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" else - filter_cmd=cat + filter_cmd=cat fi if [ -f $lang/phones/word_boundary.int ]; then @@ -63,7 +66,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; else @@ -76,7 +79,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; fi diff --git a/egs/wsj/s5/steps/get_train_ctm.sh b/egs/wsj/s5/steps/get_train_ctm.sh index a6cbb2ac06a..10b29708d84 100755 --- a/egs/wsj/s5/steps/get_train_ctm.sh +++ b/egs/wsj/s5/steps/get_train_ctm.sh @@ -7,9 +7,12 @@ # begin configuration section. cmd=run.pl +frame_shift=0.01 stage=0 use_segments=true # if we have a segments file, use it to convert # the segments to be relative to the original files. +print_silence=false # if true, will print (optional-silence) arcs. + #end configuration section. echo "$0 $@" # Print the command line for logging @@ -26,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your alignments have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri3a_ali" echo "Produces ctm in: exp/tri3a_ali/ctm" @@ -58,9 +63,9 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 else if [ ! -f $lang/phones/align_lexicon.int ]; then echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align." @@ -71,14 +76,14 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 fi fi if [ $stage -le 1 ]; then - if [ -f $data/segments ]; then + if [ -f $data/segments ] && $use_segments; then f=$data/reco2file_and_channel [ ! 
-f $f ] && echo "$0: expecting file $f to exist" && exit 1; for n in `seq $nj`; do gunzip -c $dir/ctm.$n.gz; done | \ diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 425fbc39f99..092bc53f5e8 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -9,6 +9,7 @@ cmd=run.pl skip_scoring=false stage=1 +scoring_opts= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -57,7 +58,7 @@ fi if ! $skip_scoring && [ $stage -le 2 ]; then err_msg="Not scoring because local/score.sh does not exist or not executable." [ ! -x local/score.sh ] && echo $err_msg && exit 1; - local/score.sh --cmd "$cmd" $data $newlang $outdir + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir else echo "Not scoring because requested so..." fi diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh index 1d152f6cf8d..09c34d40b24 100755 --- a/egs/wsj/s5/steps/make_mfcc.sh +++ b/egs/wsj/s5/steps/make_mfcc.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 @@ -81,7 +81,7 @@ if [ -f $data/segments ]; then for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done - + utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null @@ -127,8 +127,8 @@ done > $data/feats.scp rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/nnet/train.sh b/egs/wsj/s5/steps/nnet/train.sh index 8aceffccdaa..656acf2a815 100755 --- a/egs/wsj/s5/steps/nnet/train.sh +++ b/egs/wsj/s5/steps/nnet/train.sh @@ -145,14 +145,20 @@ else labels_tr_phn="ark:ali-to-phones --per-frame=true $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" # get pdf-counts, used later for decoding/aligning, - analyze-counts --verbose=1 --binary=false "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log || exit 1 + analyze-counts --verbose=1 --binary=false \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log || exit 1 # copy the old transition model, will be needed by decoder, copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl || exit 1 # copy the tree cp $alidir/tree $dir/tree || exit 1 # make phone counts for analysis, - [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log || exit 1 + [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log || exit 1 fi ###### PREPARE FEATURES ###### @@ -365,7 +371,7 @@ if [ ! -z $nnet_init ]; then elif [ ! 
-z $nnet_proto ]; then echo "# initializing NN from prototype '$nnet_proto'"; nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log - nnet-initialize --seed $seed $nnet_proto $nnet_init + nnet-initialize --seed=$seed $nnet_proto $nnet_init else echo "# getting input/output dims :" # input-dim, @@ -424,7 +430,7 @@ else # initialize, nnet_init=$dir/nnet.init echo "# initializing the NN '$nnet_proto' -> '$nnet_init'" - nnet-initialize $nnet_proto $nnet_init + nnet-initialize --seed=$seed $nnet_proto $nnet_init # optionally prepend dbn to the initialization, if [ ! -z "$dbn" ]; then diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh deleted file mode 100755 index a960e2fcfe9..00000000000 --- a/egs/wsj/s5/steps/nnet2/get_num_frames.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# This script works out the approximate number of frames in a training directory -# this is sometimes needed by higher-level scripts - -num_samples=1000 - - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -ne 1 ]; then - ( - echo "Usage: $0 " - echo "Prints the number of frames of data in the data-dir, via sampling rather" - echo "than trying to access all the data." - ) 1>&2 -fi - -data=$1 - -if [ ! -f $data/feats.scp ]; then - if [ -f $data/segments ]; then - echo "$0: $data/feats.scp does not exist, but $data/segments does exist; using that and assuming 100 frames per second." 1>&2 - num_frames=$(cat $data/segments | awk '{x += $4 - $3;} END{print int(x*100);}') || exit 1; - echo $num_frames - exit 0; - else - echo "$0: neither $data/feats.scp nor $data/segments exist." 1>&2 - exit 1; - fi -fi - - -sample_frames=$(utils/shuffle_list.pl $data/feats.scp | head -n $num_samples | sort | feat-to-len --print-args=false scp:-) - -num_files_orig=$(wc -l <$data/feats.scp) -if [ $num_samples -lt $num_files_orig ]; then - num_files_sampled=$num_samples -else - num_files_sampled=$num_files_orig -fi - -perl -e "\$n = int(($sample_frames * 1.0 * $num_files_orig) / (1.0 * $num_files_sampled)); print \"\$n\n\";"; diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh new file mode 120000 index 00000000000..d5eab6ede07 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/get_num_frames.sh @@ -0,0 +1 @@ +../../utils/data/get_num_frames.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index fc75932d0d3..35a5bac5313 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -19,13 +19,19 @@ # Begin configuration section. cmd=run.pl feat_type=raw # set it to 'lda' to use LDA features. -frames_per_eg=25 # number of frames of labels per example. more->less disk space and - # less time preparing egs, but more I/O during training. - # note: the script may reduce this if reduce_frames_per_eg is true. 
+frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. note: the script may reduce this if + # reduce_frames_per_eg is true. frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. # can be useful to avoid wasted data if you're using --left-deriv-truncate # and --right-deriv-truncate. -frame_subsampling_factor=3 # ratio between input and output frame-rate of nnet. +cut_zero_frames=-1 # if activated, activates new-style derivative weights.. i'll reorganize + # this if it works well. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +alignment_subsampling_factor=3 # frames-per-second of input alignments divided + # by frames-per-second at output of chain model left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. @@ -41,10 +47,13 @@ num_utts_subset=300 # number of utterances in validation and training num_valid_egs_combine=0 # #validation examples for combination weights at the very end. num_train_egs_combine=1000 # number of train examples for the above. num_egs_diagnostic=400 # number of frames for "compute_prob" jobs -frames_per_iter=400000 # each iteration of training, see this many frames - # per job. This is just a guideline; it will pick a number +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. + right_tolerance= #CTC right tolerance == max label delay. +left_tolerance= transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms @@ -263,7 +272,7 @@ if [ $stage -le 2 ]; then fi -egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames" [ -z $valid_left_context ] && valid_left_context=$left_context; @@ -271,10 +280,12 @@ egs_opts="--left-context=$left_context --right-context=$right_context --num-fram # don't do the overlap thing for the validation data. valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" -ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$frame_subsampling_factor" +ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" [ ! -z $right_tolerance ] && \ ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance" +[ ! 
-z $left_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance" echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context @@ -326,7 +337,7 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - # create egs_orig.*.*.ark; the first index goes to $nj, + # create cegs_orig.*.*.ark; the first index goes to $nj, # the second to $num_archives_intermediate. egs_list= @@ -379,7 +390,7 @@ if [ $stage -le 5 ]; then for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark - ln -sf egs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 done done $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ @@ -394,6 +405,9 @@ if [ $stage -le 6 ]; then ( cd $dir for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null ) if [ $archives_multiple -gt 1 ]; then # there are some extra soft links that we should delete. diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py new file mode 100644 index 00000000000..87961a0a8a6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -0,0 +1,245 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +import subprocess +import logging +import math +import re +import time +import imp +import os + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def GetNumberOfLeaves(dir): + [stdout, stderr] = train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + parts = stdout.split() + #number of pdfs 7115 + assert(' '.join(parts[0:3]) == "number of pdfs") + num_leaves = int(parts[3]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + +def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): + train_lib.RunKaldiCommand(""" + {command} {dir}/log/make_phone_lm.log \ + chain-est-phone-lm {lm_opts} \ + "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + {dir}/phone_lm.fst + """.format(command = run_opts.command, + dir = dir, + lm_opts = lm_opts if lm_opts is not None else '', + tree_dir = tree_dir)) + +def CreateDenominatorFst(dir, tree_dir, run_opts): + train_lib.RunKaldiCommand(""" + copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl + {command} {dir}/log/make_den_fst.log \ + chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ + {dir}/den.fst {dir}/normalization.fst""".format( + tree_dir = tree_dir, dir = dir, command = run_opts.command)) + +def GenerateChainEgs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage = 0, + valid_left_context = None, valid_right_context = None, + left_tolerance = None, right_tolerance = None, + frame_subsampling_factor = 3, + alignment_subsampling_factor = 3, + feat_type = 'raw', 
online_ivector_dir = None, + frames_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + train_lib.RunKaldiCommand(""" +steps/nnet3/chain/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context '{valid_left_context}' \ + --valid-right-context '{valid_right_context}' \ + --left-tolerance '{left_tolerance}' \ + --right-tolerance '{right_tolerance}' \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --alignment-subsampling-factor {alignment_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {dir} {lat_dir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context if valid_left_context is not None else '', + valid_right_context = valid_right_context if valid_right_context is not None else '', + left_tolerance = left_tolerance if left_tolerance is not None else '', + right_tolerance = right_tolerance if right_tolerance is not None else '', + frame_subsampling_factor = frame_subsampling_factor, + alignment_subsampling_factor = alignment_subsampling_factor, + stage = stage, frames_per_iter = frames_per_iter, + frames_per_eg = frames_per_eg, + data = data, lat_dir = lat_dir, dir = dir, egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +# this function is exactly similar to the version in nnet3_train_lib.py +# except it uses egs files in place of cegs files +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + + # Write stats with the same format as stats for LDA. + train_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
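The comment above points at Appendix C.6 of the cited paper for the "scaled LDA without dimensionality reduction" idea. Purely as an illustration of that idea, and not of Kaldi's actual nnet-get-feature-transform estimator (in particular, the sqrt(lam / (lam + 1)) scaling below is just one plausible choice), a NumPy sketch of such a transform might look like:

import numpy as np

def lda_like_transform(feats, labels, num_classes):
    # Rough sketch only: an LDA-style affine transform that keeps every input
    # dimension and rescales it instead of truncating to the leading directions.
    # feats: (N, dim) float array; labels: (N,) integer array of class ids.
    dim = feats.shape[1]
    global_mean = feats.mean(axis=0)
    within = np.zeros((dim, dim))
    between = np.zeros((dim, dim))
    for c in range(num_classes):
        rows = feats[labels == c]
        if rows.shape[0] == 0:
            continue
        class_mean = rows.mean(axis=0)
        centered = rows - class_mean
        within += centered.T.dot(centered)
        md = (class_mean - global_mean)[:, None]
        between += rows.shape[0] * md.dot(md.T)
    within /= feats.shape[0]
    between /= feats.shape[0]
    # whiten with respect to the within-class covariance ...
    w_vals, w_vecs = np.linalg.eigh(within)
    w_vals = np.maximum(w_vals, 1e-10)
    whiten = w_vecs.dot(np.diag(w_vals ** -0.5)).dot(w_vecs.T)
    # ... then rotate to diagonalize the whitened between-class covariance
    b_vals, b_vecs = np.linalg.eigh(whiten.dot(between).dot(whiten))
    # keep all dimensions (no reduction); scale each one instead of discarding it
    scale = np.sqrt(np.maximum(b_vals, 0.0) / (np.maximum(b_vals, 0.0) + 1.0))
    transform = np.diag(scale).dot(b_vecs.T).dot(whiten)
    offset = -transform.dot(global_mean)   # makes it an affine transform
    return transform, offset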
+ + train_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +def PrepareInitialAcousticModel(dir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # The model-format for a 'chain' acoustic model is just the transition + # model and then the raw nnet, so we can use 'cat' to create this, as + # long as they have the same mode (binary or not binary). + # We ensure that they have the same mode (even if someone changed the + # script to make one or both of them text mode) by copying them both + # before concatenating them. + train_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( + command = run_opts.command, dir = dir)) + +def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts): + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + train_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} "ark:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + dir = dir, raw_models = " ".join(raw_model_strings), + num_chunk_per_minibatch = num_chunk_per_minibatch, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
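As a tiny standalone check of which models CombineModels above hands to nnet3-chain-combine (the counts here are hypothetical, not from any recipe):

# the loop above combines the last num_iters_combine models
num_iters = 800
num_iters_combine = 20
models = ["{0}.mdl".format(it)
          for it in range(num_iters - num_iters_combine + 1, num_iters + 1)]
print(models[0], models[-1], len(models))   # 781.mdl 800.mdl 20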
+ ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, iter = iter, model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + +def ComputeProgress(dir, iter, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + train_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-am-info {model} '&&' \ +nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model), wait = wait) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py new file mode 100755 index 00000000000..08746d523ee --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + +import os +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +import shutil +import math + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains RNN and DNN acoustic models using the 'chain' objective function. 
+ """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="directory with the ivectors extracted in an online fashion.") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 150, + help="Number of output labels in each example. Caution: if you double this you should halve --trainer.samples-per-iter.") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="Directory with egs. If specified this directory " + "will be used rather than extracting egs") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = -6, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.l2-regularize", type=float, dest='l2_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " l2-norm of the output of the network. It should be" + " used without the log-softmax layer for the outputs." 
+ " As l2-norm of the log-softmax outputs can dominate" + " the objective function.") + parser.add_argument("--chain.xent-regularize", type=float, dest='xent_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " cross-entropy cost the outputs.") + parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', + default = 5, help="") + parser.add_argument("--chain.left-tolerance", type=int, dest='left_tolerance', + default = 5, help="") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', + default = 0.00001, help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', + default=True, action=train_lib.StrToBoolAction, + choices = ["true", "false"], + help="") + parser.add_argument("--chain.truncate-deriv-weights", type=float, dest='truncate_deriv_weights', + default =0, + help="Can be used to set to zero the weights of derivs" + " from frames near the edges. (counts subsampled frames)") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', + default = 3, + help="ratio of frames-per-second of features we train" + " on, to chain model's output") + parser.add_argument("--chain.alignment-subsampling-factor", type=int, + dest='alignment_subsampling_factor', + default = 3, + help="ratio of frames-per-second of input alignments to" + " chain model's output") + parser.add_argument("--chain.ngram-order", type=int, dest='ngram_order', + default = 3, help="") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default = None, help="") + parser.add_argument("--chain.right-deriv-truncate", type=int, + dest='right_deriv_truncate', + default = None, help="") + + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 10, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final" + " model combination stage. These models will themselves" + " be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + " iteration. If 0 or a large value the randomization is" + " complete, but this will consume memory and cause spikes" + " in disk I/O. Smaller is easier on disk and memory but" + " less random. It's not a huge deal though, as samples" + " are anyway randomized right at the start. 
(the point" + " of this is to get data in different minibatches on" + " different iterations, since in the preconditioning" + " method, 2 samples in the same minibatch can affect" + " each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + " during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per" + " minibatch, measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', + default=800000, + help ="Each iteration of training, see this many [input]" + " frames per job. This option is passed to get_egs.sh." + " Aim for about a minute of training time") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="Value used in preconditioning matrix estimation") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="Max number of jobs used for LDA stats accumulation") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at" + " the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = "The is the maximum number of models we give to" + " the final 'combine' stage, but these models will" + " themselves be averages of iteration-number ranges.") + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="Momentum used in update computation." + " Note: we implemented it in such a way that it doesn't" + " increase the effective learning rate.") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 1.0, + help="Scaling factor used for scaling the parameter" + " matrices when the derivative averages are below the" + " shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this" + " threshold we scale the parameter matrices with the" + " shrink-value. It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, dest='shrink_nonlinearity', + default = "SigmoidComponent", choices = ["TanhComponent", "SigmoidComponent"], + help="The non-linear component from which the" + " deriv-avg values are going to used to compute" + " mean-deriv-avg. The mean-deriv-avg is going to be" + " compared with shrink-threshold. 
Be careful to specify" + " a shrink-threshold which is dependent on the" + " shrink-nonlinearity type") + + # RNN specific trainer options + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=512, + help="Number of sequences to be processed in parallel every minibatch" ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = train_lib.NullstrToNoneAction, dest="command", + help="Specifies the script to launch jobs." + " e.g. queue.pl for launching on SGE cluster run.pl" + " for launching on local machine", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="If true, remove egs after experiment") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = train_lib.NullstrToNoneAction, + help="Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local expertise to setup. ") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--tree-dir", type=str, required = True, + help="Languade directory") + parser.add_argument("--lat-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not train_lib.CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + + run_opts.command = args.command + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.parallel_train_opts = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. 
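A standalone illustration of the archive / frame-shift indexing computed in TrainNewModels above, using made-up sizes (2 jobs, 5 archives, frame_subsampling_factor of 3; the real script derives these from the egs directory):

# hypothetical sizes, for illustration only
num_jobs = 2
num_archives = 5
frame_subsampling_factor = 3
num_archives_processed = 7   # pretend we are some way into training

for job in range(1, num_jobs + 1):
    k = num_archives_processed + job - 1          # zero-based global index
    archive_index = (k % num_archives) + 1        # 1-based archive to read
    # same expression as in TrainNewModels above (integer division)
    frame_shift = (archive_index + k // num_archives) % frame_subsampling_factor
    print(job, archive_index, frame_shift)
# job 1: k=7 -> archive 3, frame_shift (3 + 1) % 3 = 1
# job 2: k=8 -> archive 4, frame_shift (4 + 1) % 3 = 2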
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts) + + [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
+ train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + CheckForRequiredFiles(args.feat_dir, args.tree_dir, args.lat_dir) + + # Set some variables. + num_jobs = train_lib.GetNumberOfJobs(args.tree_dir) + feat_dim = train_lib.GetFeatDim(args.feat_dir) + ivector_dim = train_lib.GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + train_lib.SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = train_lib.ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
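The series of "if (args.stage <= ...)" blocks that follows uses the usual Kaldi resume-from-stage convention; a compressed summary of the setup stages it walks through (descriptions paraphrased from the code below) is:

# summary of the setup stages in Train() below; stage >= 0 is the iteration
# loop.  Passing --stage N skips everything whose stage value is < N.
SETUP_STAGES = [
    (-6, "create the phone language model (chain-est-phone-lm)"),
    (-5, "create the denominator FST (chain-make-den-fst)"),
    (-4, "initialize init.raw from configs/init.config"),
    (-3, "generate egs with steps/nnet3/chain/get_egs.sh (unless --egs.dir is given)"),
    (-2, "compute the LDA-like preconditioning matrix"),
    (-1, "prepare the initial acoustic model (0.mdl)"),
]

def run_setup(requested_stage):
    # minimal sketch of the gating pattern used in Train()
    for stage, description in SETUP_STAGES:
        if requested_stage <= stage:
            print("would run:", description)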
+ if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.CreatePhoneLm(args.dir, args.tree_dir, run_opts, lm_opts = args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + chain_lib.CreateDenominatorFst(args.dir, args.tree_dir, run_opts) + + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + train_lib.RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + # this is where get_egs.sh is called. + chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, + left_context + args.frame_subsampling_factor/2, + right_context + args.frame_subsampling_factor/2, + run_opts, + left_tolerance = args.left_tolerance, + right_tolerance = args.right_tolerance, + frame_subsampling_factor = args.frame_subsampling_factor, + alignment_subsampling_factor = args.alignment_subsampling_factor, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + frames_per_iter = args.frames_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + chain_lib.PrepareInitialAcousticModel(args.dir, run_opts) + + file_handle = open("{0}/frame_subsampling_factor".format(args.dir),"w") + file_handle.write(str(args.frame_subsampling_factor)) + file_handle.close() + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
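As a numerical sanity check of the comment above and the formulas just below (all numbers here are made up, not taken from any recipe):

# hypothetical values, for illustration only
num_archives = 120
frame_subsampling_factor = 3
num_epochs = 10
num_jobs_initial, num_jobs_final = 1, 8

num_archives_expanded = num_archives * frame_subsampling_factor      # 360
num_archives_to_process = num_epochs * num_archives_expanded         # 3600
# same formula as below; the average number of jobs is (1 + 8) / 2 = 4.5
num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)
print(num_iters)                                                     # 800

# the number of parallel jobs ramps linearly from num_jobs_initial to num_jobs_final
for it in (0, 400, 799):
    current_num_jobs = int(0.5 + num_jobs_initial +
                           (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
    print(it, current_num_jobs)   # 0 -> 1, 400 -> 5, 799 -> 8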
+ num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = train_lib.VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: train_lib.GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if args.shrink_value != 1.0: + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 + else: + shrinkage_value = args.shrink_value + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, + args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, + args.frame_subsampling_factor, + args.truncate_deriv_weights, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, key="log-probability") + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + train_lib.SendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, + args.num_chunk_per_minibatch, egs_dir, + args.leaky_hmm_coefficient, args.l2_regularize, + args.xent_regularize, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = 
nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") + if args.email is not None: + train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + train_lib.SendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 1a62d8d7bb6..d89e9a335dc 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -23,21 +23,33 @@ truncate_deriv_weights=0 # can be used to set to zero the weights of derivs fro apply_deriv_weights=true initial_effective_lrate=0.0002 final_effective_lrate=0.00002 +extra_left_context=0 # only relevant for recurrent setups. pnorm_input_dim=3000 pnorm_output_dim=300 relu_dim= # you can use this to make it use ReLU's instead of p-norms. + +jesus_opts= # opts to steps/nnet3/make_jesus_configs.py. + # If nonempty, assumes you want to use the jesus nonlinearity, + # and you should supply various options to that script in + # this string. rand_prune=4.0 # Relates to a speedup we do for LDA. minibatch_size=512 # This default is suitable for GPU-based training. # Set it to 128 for multi-threaded CPU-based training. lm_opts= # options to chain-est-phone-lm +l2_regularize=0.0 +leaky_hmm_coefficient=0.00001 +xent_regularize=0.0 frames_per_iter=800000 # each iteration of training, see this many [input] # frames per job. This option is passed to get_egs.sh. # Aim for about a minute of training time -right_tolerance=10 -denominator_scale=1.0 # relates to tombsone stuff. +right_tolerance=5 # tolerance at the same frame-rate as the alignment directory. +left_tolerance=5 # tolerance at the same frame-rate as the alignment directory. num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -frame_subsampling_factor=3 # controls reduced frame-rate at the output. +frame_subsampling_factor=3 # ratio of frames-per-second of features we train + # on, to chain model's output +alignment_subsampling_factor=3 # ratio of frames-per-second of input alignments + # to chain model's output get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= max_param_change=2.0 @@ -66,6 +78,10 @@ exit_stage=-100 # you can set this to terminate the training early. Exits befor # count space-separated fields in splice_indexes to get num-hidden-layers. splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count @@ -87,7 +103,7 @@ right_deriv_truncate= # number of time-steps to avoid using the deriv of, on th # End configuration section.
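frame_subsampling_factor and alignment_subsampling_factor above are both defined as ratios relative to the chain model's output frame rate. Assuming the common 10 ms feature frame shift, i.e. 100 feature frames per second (an assumption made here for illustration, not something the script fixes), the arithmetic works out as:

from fractions import Fraction

feature_fps = Fraction(100)     # assumes a 10 ms frame shift; the script only fixes the ratios
frame_subsampling_factor = 3    # feature frame rate / chain-model output frame rate
chain_output_fps = feature_fps / frame_subsampling_factor          # 100/3, ~33 outputs per second

alignment_fps = Fraction(100)   # e.g. alignments and lattices from a conventional GMM/HMM system
alignment_subsampling_factor = alignment_fps / chain_output_fps    # == 3
print(chain_output_fps, alignment_subsampling_factor)              # 100/3 3

# by the same definitions, if the lattices already came from a frame-subsampled
# (chain) system running at ~33 frames per second, alignment_subsampling_factor
# would be 1 while frame_subsampling_factor stayed at 3.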
-trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM echo "$0 $@" # Print the command line for logging @@ -197,23 +213,44 @@ num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exi if [ $stage -le -5 ]; then echo "$0: creating neural net configs"; - if [ ! -z "$relu_dim" ]; then - dim_opts="--relu-dim $relu_dim" + + if [ ! -z "$jesus_opts" ]; then + $cmd $dir/log/make_configs.log \ + python steps/nnet3/make_jesus_configs.py \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $jesus_opts \ + --num-targets $num_leaves \ + $dir/configs || exit 1; else - dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" - fi + [ $xent_regularize != "0.0" ] && \ + echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi - # create the config files for nnet initialization - python steps/nnet3/make_tdnn_configs.py \ - --include-log-softmax=false \ - --final-layer-normalize-target $final_layer_normalize_target \ - --splice-indexes "$splice_indexes" \ - --feat-dim $feat_dim \ - --ivector-dim $ivector_dim \ - $dim_opts \ - --num-targets $num_leaves \ - --use-presoftmax-prior-scale false \ - $dir/configs || exit 1; + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + --include-log-softmax=false \ + --final-layer-normalize-target $final_layer_normalize_target \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --num-targets $num_leaves \ + --use-presoftmax-prior-scale false \ + $dir/configs || exit 1; + fi # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; @@ -229,6 +266,12 @@ fi # num_hidden_layers=(something) . $dir/configs/vars || exit 1; +# the next 2 lines are in case the configs were created by an older +# config-generating script, which writes to left_context and right_context +# instead of model_left_context and model_right_context. +[ -z $model_left_context ] && model_left_context=$left_context +[ -z $model_right_context ] && model_right_context=$right_context + ! [ "$num_hidden_layers" -gt 0 ] && echo \ "$0: Expected num_hidden_layers to be defined" && exit 1; @@ -242,14 +285,17 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts+=(--transform-dir $transform_dir) # we need a bit of extra left-context and right-context to allow for frame # shifts (we use shifted version of the data for more variety). 
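The extra_opts lines just below widen the egs context to allow for the frame shifts mentioned in the comment above; with hypothetical model contexts the integer arithmetic looks like this:

# hypothetical model contexts, for illustration only
model_left_context = 16
model_right_context = 12
frame_subsampling_factor = 3
extra_left_context = 0            # nonzero only for recurrent setups

# same integer arithmetic as the extra_opts lines below
egs_left_context = model_left_context + frame_subsampling_factor // 2 + extra_left_context
egs_right_context = model_right_context + frame_subsampling_factor // 2
print(egs_left_context, egs_right_context)    # 17 13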
- extra_opts+=(--left-context $[$left_context+$frame_subsampling_factor/2]) - extra_opts+=(--right-context $[$right_context+$frame_subsampling_factor/2]) + extra_opts+=(--left-context $[$model_left_context+$frame_subsampling_factor/2+$extra_left_context]) + extra_opts+=(--right-context $[$model_right_context+$frame_subsampling_factor/2]) echo "$0: calling get_egs.sh" steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ --frames-per-iter $frames_per_iter --stage $get_egs_stage \ --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ --frames-per-eg $frames_per_eg \ --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $alignment_subsampling_factor \ $data $dir $latdir $dir/egs || exit 1; fi @@ -271,8 +317,8 @@ cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null # the --egs-dir option was used on the command line). egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 -( [ $egs_left_context -lt $left_context ] || \ - [ $egs_right_context -lt $right_context ] ) && \ +( [ $egs_left_context -lt $model_left_context ] || \ + [ $egs_right_context -lt $model_right_context ] ) && \ echo "$0: egs in $egs_dir have too little context" && exit -1; frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } @@ -414,11 +460,11 @@ while [ $x -lt $num_iters ]; do # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & @@ -440,10 +486,12 @@ while [ $x -lt $num_iters ]; do cur_num_hidden_layers=$[1+$x/$add_layers_period] config=$dir/configs/layer$cur_num_hidden_layers.config mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_io_opts="" else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_io_opts="--read-cache=$dir/cache.$x" fi if $do_average; then this_minibatch_size=$minibatch_size @@ -461,7 +509,9 @@ while [ $x -lt $num_iters ]; do rm $dir/.error 2>/dev/null - ( # this sub-shell is so that when we "wait" below, + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. @@ -473,10 +523,16 @@ while [ $x -lt $num_iters ]; do # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
frame_shift=$[($k/$num_archives)%$frame_subsampling_factor]; - + if [ $n -eq 1 ]; then + # opts for computation cache (storing compiled computation). + this_cache_io_opts="$cache_io_opts --write-cache=$dir/cache.$[$x+1]" + else + this_cache_io_opts="$cache_io_opts" + fi $cmd $train_queue_opt $dir/log/train.$x.$n.log \ nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \ - $parallel_train_opts $deriv_time_opts \ + --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + $this_cache_io_opts $parallel_train_opts $deriv_time_opts \ --max-param-change=$this_max_param_change \ --print-interval=10 "$mdl" $dir/den.fst \ "ark:nnet3-chain-copy-egs --truncate-deriv-weights=$truncate_deriv_weights --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ @@ -518,6 +574,7 @@ while [ $x -lt $num_iters ]; do rm $dir/$[$x-1].mdl fi fi + rm $dir/cache.$x 2>/dev/null x=$[$x+1] num_archives_processed=$[$num_archives_processed+$this_num_jobs] done @@ -543,7 +600,7 @@ if [ $stage -le $num_iters ]; then # num-threads to 8 to speed it up (this isn't ideal...) $cmd $combine_queue_opt $dir/log/combine.log \ - nnet3-chain-combine --num-iters=40 \ + nnet3-chain-combine --num-iters=40 --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1; @@ -553,11 +610,11 @@ if [ $stage -le $num_iters ]; then # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. 
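The --read-cache/--write-cache handling above follows a simple pattern: every job of iteration x reads cache.x (unless a layer was just added, in which case there is nothing to read), only job 1 writes cache.x+1, and cache.x is deleted once the iteration finishes. A standalone sketch of who gets which flags, with a made-up experiment directory and a helper name that is mine, not the script's:

# illustration of the computation-cache options used above; paths are hypothetical
exp_dir = "exp/chain/tdnn1"

def cache_io_opts_for(iteration, job, just_added_layer):
    opts = "" if just_added_layer else "--read-cache={0}/cache.{1}".format(exp_dir, iteration)
    if job == 1:
        # only the first job writes the cache used by the next iteration
        opts += " --write-cache={0}/cache.{1}".format(exp_dir, iteration + 1)
    return opts.strip()

for job in (1, 2, 3):
    print(job, cache_io_opts_for(iteration=5, job=job, just_added_layer=False))
# job 1: --read-cache=exp/chain/tdnn1/cache.5 --write-cache=exp/chain/tdnn1/cache.6
# jobs 2, 3: --read-cache=exp/chain/tdnn1/cache.5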
$cmd $dir/log/compute_prob_valid.final.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ - nnet3-chain-compute-prob \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & fi diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 87323a1c3e1..e14ab40519f 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -6,6 +6,24 @@ import sys import warnings import copy +from operator import itemgetter + +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors # adds the input nodes and returns the descriptor def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): @@ -19,11 +37,24 @@ def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): components.append('input-node name=ivector dim=' + str(ivector_dim)) list.append('ReplaceIndex(ivector, t, 0)') output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] print(splice_descriptor) return {'descriptor': splice_descriptor, 'dimension': output_dim} +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + def AddLdaLayer(config_lines, name, input, lda_file): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -34,6 +65,27 @@ def AddLdaLayer(config_lines, name, input, lda_file): return {'descriptor': '{0}_lda'.format(name), 'dimension': input['dimension']} +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + +def 
AddPermuteLayer(config_lines, name, input, column_map): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -44,13 +96,14 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" return {'descriptor': '{0}_affine'.format(name), 'dimension': output_dim} -def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = ""): +def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) - components.append("component name={0}_relu type=RectifiedLinearComponent dim={1}".format(name, output_dim)) - components.append("component name={0}_renorm type=NormalizeComponent dim={1}".format(name, output_dim)) + components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) + components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) @@ -59,7 +112,34 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options return {'descriptor': '{0}_renorm'.format(name), 'dimension': output_dim} +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = "component name={0}_conv type=ConvolutionComponent input-x-dim={1} input-y-dim={2} input-z-dim={3} filt-x-dim={4} filt-y-dim={5} filt-x-step={6} filt-y-step={7} input-vectorization-order={8}".format(name, input_x_dim, input_y_dim, input_z_dim, filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, input_vectorization) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + if is_updatable: + conv_init_string += " is-updatable=true" + else: + conv_init_string += " is-updatable=false" + + components.append(conv_init_string) + component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, 
input['descriptor'])) + num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim} def AddSoftmaxLayer(config_lines, name, input): components = config_lines['components'] @@ -72,152 +152,83 @@ def AddSoftmaxLayer(config_lines, name, input): 'dimension': input['dimension']} -def AddOutputNode(config_lines, input, label_delay=None): +def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] - if label_delay is None: - component_nodes.append('output-node name=output input={0}'.format(input['descriptor'])) - else: - component_nodes.append('output-node name=output input=Offset({0},{1})'.format(input['descriptor'], label_delay)) - -def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = "", label_delay=None, include_softmax = "true"): - prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim, ng_affine_options) - if include_softmax == "true": - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output, label_delay) + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' + components.append("component name={0}_sigmoid type=SigmoidComponent dim={1}".format(name, input['dimension'], self_repair_string)) + component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor'])) + return {'descriptor': '{0}_sigmoid'.format(name), + 'dimension': input['dimension']} -def AddLstmLayer(config_lines, - name, input, cell_dim, - recurrent_projection_dim = 0, - non_recurrent_projection_dim = 0, - clipping_threshold = 1.0, - norm_based_clipping = "false", - ng_per_element_scale_options = "", - ng_affine_options = "", - lstm_delay = -1): - assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) +def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"): components = config_lines['components'] component_nodes = config_lines['component-nodes'] + name = 'output' + if suffix is not None: + name = '{0}-{1}'.format(name, suffix) - input_descriptor = input['descriptor'] - input_dim = input['dimension'] - name = name.strip() - - if (recurrent_projection_dim == 0): - add_recurrent_projection = False - recurrent_projection_dim = cell_dim - recurrent_connection = "m_t" - else: - add_recurrent_projection = True - recurrent_connection = "r_t" - if (non_recurrent_projection_dim == 0): - add_non_recurrent_projection = False + if label_delay is None: + component_nodes.append('output-node name={0} input={1} objective={2}'.format(name, input['descriptor'], objective_type)) else: - add_non_recurrent_projection = True - - # Natural gradient per element scale parameters - ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " - # Parameter Definitions W*(* replaced by - to have valid names) - components.append("# Input gate control : W_i* matrices") - components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - components.append("# note : the cell outputs pass through a diagonal matrix") - 
components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) - - components.append("# Forget gate control : W_f* matrices") - components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) - - components.append("# Output gate control : W_o* matrices") - components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) - - components.append("# Cell input matrices : W_c* matrices") - components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) - - - components.append("# Defining the non-linearities") - components.append("component name={0}_i type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_f type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_o type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_g type=TanhComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_h type=TanhComponent dim={1}".format(name, cell_dim)) - - components.append("# Defining the cell computations") - components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) - - # c1_t and c2_t defined below - component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name)) - c_tminus1_descriptor = "IfDefined(Offset({0}_c_t, {1}))".format(name, lstm_delay) - - component_nodes.append("# i_t") - component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - component_nodes.append("component-node name={0}_i2 component={0}_w_ic input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name)) - - component_nodes.append("# f_t") - component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - 
component_nodes.append("component-node name={0}_f2 component={0}_w_fc input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name)) - - component_nodes.append("# o_t") - component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - component_nodes.append("component-node name={0}_o2 component={0}_w_oc input={0}_c_t".format(name)) - component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name)) - - component_nodes.append("# h_t") - component_nodes.append("component-node name={0}_h_t component={0}_h input={0}_c_t".format(name)) - - component_nodes.append("# g_t") - component_nodes.append("component-node name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) - component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name)) - - component_nodes.append("# parts of c_t") - component_nodes.append("component-node name={0}_c1_t component={0}_c1 input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name)) + component_nodes.append('output-node name={0} input=Offset({1},{2}) objective={3}'.format(name, input['descriptor'], label_delay, objective_type)) + +def AddFinalLayer(config_lines, input, output_dim, + ng_affine_options = " param-stddev=0 bias-stddev=0 ", + label_delay=None, + use_presoftmax_prior_scale = False, + prior_scale_file = None, + include_log_softmax = True, + name_affix = None, + objective_type = "linear"): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] - component_nodes.append("# m_t") - component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, {0}_h_t)".format(name)) + if name_affix is not None: + final_node_prefix = 'Final-' + str(name_affix) + else: + final_node_prefix = 'Final' + + prev_layer_output = AddAffineLayer(config_lines, + final_node_prefix , input, output_dim, + ng_affine_options) + if include_log_softmax: + if use_presoftmax_prior_scale : + components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) + component_nodes.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix, + prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix) + prev_layer_output = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + # we use the same name_affix as a prefix in for affine/scale nodes but as a + # suffix for output node + AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) + +def AddFinalSigmoidLayer(config_lines, input, output_dim, + ng_affine_options = " param-stddev=0 bias-stddev=0 ", + label_delay=None, + name_affix = None, + objective_type = "quadratic"): + # Useful when you need the final outputs to be probabilities + # between 0 and 1. 
+ # Usually used with an objective-type such as "quadratic" + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] - # add the recurrent connections - if (add_recurrent_projection and add_non_recurrent_projection): - components.append("# projection matrices : Wrm and Wpm") - components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) - component_nodes.append("# r_t and p_t") - component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) - component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) - component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) - output_descriptor = '{0}_rp_t'.format(name) - output_dim = recurrent_projection_dim + non_recurrent_projection_dim + if name_affix is not None: + final_node_prefix = 'Final-' + str(name_affix) + else: + final_node_prefix = 'Final' - elif add_recurrent_projection: - components.append("# projection matrices : Wrm") - components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) - component_nodes.append("# r_t") - component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) - component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) - output_descriptor = '{0}_r_t'.format(name) - output_dim = recurrent_projection_dim + prev_layer_output = AddAffineLayer(config_lines, + final_node_prefix , input, output_dim, + ng_affine_options) + prev_layer_output = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) - else: - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) - component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name)) - output_descriptor = '{0}_r_t'.format(name) - output_dim = cell_dim - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } -def AddClstmLayer(config_lines, +def AddLstmLayer(config_lines, name, input, cell_dim, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, @@ -226,7 +237,7 @@ def AddClstmLayer(config_lines, ng_per_element_scale_options = "", ng_affine_options = "", lstm_delay = -1, - rates = [1]): + self_repair_scale = None): assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) components = config_lines['components'] component_nodes = config_lines['component-nodes'] @@ -247,6 +258,7 @@ def AddClstmLayer(config_lines, else: add_non_recurrent_projection = True + self_repair_string = 
"self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' # Natural gradient per element scale parameters ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " # Parameter Definitions W*(* replaced by - to have valid names) @@ -268,12 +280,13 @@ def AddClstmLayer(config_lines, components.append("# Cell input matrices : W_c* matrices") components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# Defining the non-linearities") - components.append("component name={0}_i type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_f type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_o type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_g type=TanhComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_h type=TanhComponent dim={1}".format(name, cell_dim)) + components.append("component name={0}_i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_g type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_h type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) components.append("# Defining the cell computations") components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) @@ -346,6 +359,3 @@ def AddClstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - - - diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index f4de09740ae..bfdfa4da23f 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -26,6 +26,10 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel parallel_opts= # ignored now. scoring_opts= skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 feat_type= online_ivector_dir= minimize=false @@ -132,7 +136,7 @@ if [ ! 
-z "$online_ivector_dir" ]; then fi if [ "$post_decode_acwt" == 1.0 ]; then - lat_wspecifier="ark|gzip -c >$dir/lat.JOB.gz" + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" else lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" fi @@ -146,6 +150,10 @@ if [ $stage -le 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ --word-symbol-table=$graphdir/words.txt "$model" \ diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 88cf54e824e..2290c4d2e7f 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -34,6 +34,11 @@ 'shape':'box', 'style':'filled' }, + 'ConvolutionComponent':{ + 'color':'lightpink', + 'shape':'box', + 'style':'filled' + }, 'FixedScaleComponent':{ 'color':'blueviolet', 'shape':'box', @@ -64,6 +69,11 @@ 'shape':'rectangle', 'style':'filled' }, + 'ClipGradientComponent':{ + 'color':'bisque', + 'shape':'rectangle', + 'style':'filled' + }, 'ElementwiseProductComponent':{ 'color':'green', 'shape':'rectangle', @@ -84,10 +94,10 @@ def GetDotNodeName(name_string, is_component = False): # 2. Nnet3 names can be shared among components and component nodes # dot does not allow common names # - name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("-", "hyphen", name_string) if is_component: - name_string += name_string.strip() + "_component" - return name_string + node_name_string += node_name_string.strip() + "_component" + return {"label":name_string, "node":node_name_string} def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -96,18 +106,18 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = "{{"+label+"}|Append}" - dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, 
label="{1}"];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -116,7 +126,7 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) if attr_string != '': dot_string += ' [{0}] '.format(attr_string) @@ -125,6 +135,28 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = return dot_graph +def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None): + dot_graph = [] + + label = 'Round ({0})'.format(segment['arguments'][1]) + style = None + if edge_attributes is not None: + if edge_attributes.has_key('label'): + label = "{0} {1}".format(edge_attributes['label'], label) + if edge_attributes.has_key('style'): + style = 'style={0}'.format(edge_attributes['style']) + + attr_string = 'label="{0}"'.format(label) + if style is not None: + attr_string += ' {0}'.format(style) + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], + attr_string)) + if segment['sub_segments']: + raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed") + return dot_graph + + def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -140,8 +172,8 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("Offset can just deal with forwarding descriptor, no sub-segments allowed") @@ -151,21 +183,23 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non dot_graph = [] names = [] desc_name = 'Sum_{0}'.format(affix) + # create the sum node for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) - dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) + dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i)) + # link the sum node parts to corresponding segments part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], 
GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = '{{'+label+'}|Sum}' - dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -174,7 +208,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) dot_string += ' [{0} tailport=s ] '.format(attr_string) dot_graph.append(dot_string) @@ -195,8 +229,8 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed") @@ -215,7 +249,7 @@ def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes dot_graph += DescriptorSegmentToDot(sub_segment, parent_node_name, parent_node_name, edge_attributes={'style':'dotted', 'label':'IfDefined'}) if segment['arguments']: - dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0]), GetDotNodeName(parent_node_name))) + dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_graph @@ -232,6 +266,8 @@ def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = N dot_graph += ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes) elif segment['name'] == "ReplaceIndex": dot_graph += ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes) + elif segment['name'] == "Round": + dot_graph += ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes) else: raise Exception('Descriptor {0}, is not recognized by this script. 
Please add Process{0}Descriptor method'.format(segment['name'])) return dot_graph @@ -244,7 +280,7 @@ def Nnet3DescriptorToDot(descriptor, parent_node_name): dot_lines += DescriptorSegmentToDot(segment, parent_node_name, parent_node_name) elif arguments: assert(len(arguments) == 1) - dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0]), GetDotNodeName(parent_node_name))) + dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_lines def ParseNnet3String(string): @@ -298,27 +334,28 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): except KeyError: pass - return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True), label, attr_string)] + return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)] # input-node name=input dim=40 def Nnet3InputToDot(parsed_config): - return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['dim'] )] + return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['dim'] )] # output-node name=output input=Final_log_softmax dim=3940 objective=linear +#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear def Nnet3OutputToDot(parsed_config): dot_graph = [] - dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['objective'])) - dot_graph.append('{0} -> {1}'.format(GetDotNodeName(parsed_config['input']), GetDotNodeName(parsed_config['name']))) + dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) + dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['objective'])) return dot_graph # dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256 def Nnet3DimrangeToDot(parsed_config): dot_graph = [] - dot_graph.append(parsed_config['name']) - dot_graph.append('{0} [shape=rectangle]'.format(GetDotNodeName(parsed_config['name']))) - dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node']), - GetDotNodeName(parsed_config['name']), + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [shape=rectangle, label="{1}"]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node'])['node'], + GetDotNodeName(parsed_config['name'])['node'], parsed_config['dim-offset'], parsed_config['dim'])) return dot_graph @@ -326,9 +363,10 @@ def Nnet3DimrangeToDot(parsed_config): def Nnet3ComponentNodeToDot(parsed_config): dot_graph = [] dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) - dot_graph.append('{0} [ label="{1}", shape=box ]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'])) - dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True), - GetDotNodeName(parsed_config['name']))) + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [ label="{1}", shape=box ]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [ weight=10 
]'.format(GetDotNodeName(parsed_config['component'], is_component = True)['node'], + GetDotNodeName(parsed_config['name'])['node'])) return dot_graph def GroupConfigs(configs, node_prefixes = []): @@ -408,6 +446,8 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): " will be clustered together in the dot-graph" " --node-prefixes Lstm1,Lstm2,Layer1", default=None) + parser.add_argument("dotfile", help="name of the dot output file") + print(' '.join(sys.argv), file=sys.stderr) args = parser.parse_args() @@ -420,4 +460,7 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): lines = sys.stdin.readlines() dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes) - print("\n".join(dot_graph)) + + dotfile_handle = open(args.dotfile, "w") + dotfile_handle.write("\n".join(dot_graph)) + dotfile_handle.close() diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index dc8cac9c0b0..364f6a72443 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -170,8 +170,8 @@ esac if [ -f $dir/trans.scp ]; then feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi if [ ! -z "$online_ivector_dir" ]; then diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh new file mode 100644 index 00000000000..7fbc24858b5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2015-2016 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/get_egs.sh but used +# when getting general targets (not from alignment directory) for raw nnet +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. +# +# This script dumps egs with several frames of labels, controlled by the +# frames_per_eg config variable (default: 8). This takes many times less disk +# space because typically we have 4 to 7 frames of context on the left and +# right, and this ends up getting shared. This is at the expense of slightly +# higher disk I/O while training. + + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +target_type=sparse # dense to have dense targets, + # sparse to have posteriors targets +num_targets= # required for target-type=sparse with raw nnet +frames_per_eg=8 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. 
+valid_left_context= # amount of left_context for validation egs, typically used in
+ # recurrent architectures to ensure matched condition with
+ # training egs
+valid_right_context= # amount of right_context for validation egs
+compress=true # set this to false to disable compression (e.g. if you want to see whether
+ # results are affected).
+
+reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg
+ # if there is only one archive and even with the
+ # reduced frames_per_eg, the number of
+ # samples_per_iter that would result is less than or
+ # equal to the user-specified value.
+num_utts_subset=300 # number of utterances in validation and training
+ # subsets used for shrinkage and diagnostics.
+num_valid_frames_combine=0 # #valid frames for combination weights at the very end.
+num_train_frames_combine=10000 # # train frames for the above.
+num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs
+samples_per_iter=400000 # this is the target number of egs in each archive of egs
+ # (prior to merging egs). We probably should have called
+ # it egs_per_iter. This is just a guideline; it will pick
+ # a number that divides the number of samples in the
+ # entire data.
+
+transform_dir=
+
+stage=0
+nj=6 # This should be set to the maximum number of jobs you are
+ # comfortable to run in parallel; you can increase it if your disk
+ # speed is greater and you have more machines.
+online_ivector_dir= # can be used if we are including speaker information as iVectors.
+cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda,
+ # it doesn't make sense to use different options than were used as input to the
+ # LDA transform). This is used to turn off CMVN in the online-nnet experiments.
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 3 ]; then
+ echo "Usage: $0 [opts] <data> <targets-scp> <egs-dir>"
+ echo " e.g.: $0 data/train data/train/snr_targets.scp exp/tri4_nnet/egs"
+ echo ""
+ echo "Main options (for others, see top of script file)"
+ echo " --config <config-file> # config file containing options"
+ echo " --nj <nj> # The maximum number of jobs you want to run in"
+ echo " # parallel (increase this only if you have good disk and"
+ echo " # network speed). default=6"
+ echo " --cmd (utils/run.pl;utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)"
+ echo " --feat-type <lda|raw> # (raw is the default). The feature type you want"
+ echo " # to use as input to the neural net."
+ echo " --frames-per-eg <frames;8> # number of frames per eg on disk"
+ echo " --left-context <width;4> # Number of frames on left side to append for feature input"
+ echo " --right-context <width;4> # Number of frames on right side to append for feature input"
+ echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics"
+ echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the"
+ echo " # very end."
+ echo " --stage <stage|0> # Used to run a partially-completed training process from somewhere in"
+ echo " # the middle."
+
+ exit 1;
+fi
+
+data=$1
+targets_scp=$2
+dir=$3
+
+# Check some files.
+[ ! -z "$online_ivector_dir" ] && \
+ extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+
+for f in $data/feats.scp $targets_scp $extra_files; do
+ [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset | sort \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset | sort > $dir/train_subset_uttlist || exit 1; + +if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + ;; + lda) + splice_opts=`cat $transform_dir/splice_opts 2>/dev/null` + # caution: the top-level nnet training script should copy these to its own dir now. + cp $transform_dir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=$(cat $dir/cmvn_opts) + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +else + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s:JOB:1:g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] +# (for small data)- while reduce_frames_per_eg == true and the number of +# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it +# by 1. +reduced=false +while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ + [ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do + frames_per_eg=$[$frames_per_eg-1] + num_archives=1 + reduced=true +done +$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). 
+max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] +! [ $egs_per_archive -le $samples_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/egs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +for n in `seq $nj`; do + utils/filter_scp.pl $sdata/$n/utt2spk $targets_scp > $dir/targets.$n.scp +done + +targets_scp_split=$dir/targets.JOB.scp + +if [ $target_type == "dense" ]; then + num_targets=$(feat-to-dim "scp:$targets_scp" - 2>/dev/null) || exit 1 +fi + +if [ -z "$num_targets" ]; then + echo "$0: num-targets is not set" + exit 1 +fi + +case $target_type in + "dense") + get_egs_program="nnet3-get-egs-dense-targets --num-targets=$num_targets" + + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | copy-feats scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | copy-feats scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | copy-feats scp:- ark:- |" + ;; + "sparse") + get_egs_program="nnet3-get-egs --num-pdfs=$num_targets" + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |" + ;; + default) + echo "$0: Unknown --target-type $target_type. Choices are dense and sparse" + exit 1 +esac + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." 
+ rm -f $dir/.error 2>/dev/null + $cmd $dir/log/create_valid_subset.log \ + $get_egs_program \ + $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + "$valid_targets" \ + "ark:$dir/valid_all.egs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + $get_egs_program \ + $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + "$train_subset_targets" \ + "ark:$dir/train_subset_all.egs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + ark:$dir/valid_combine.egs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + ark:$dir/valid_diagnostic.egs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + ark:$dir/train_combine.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + ark:$dir/train_diagnostic.egs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm -f $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs +fi + +if [ $stage -le 4 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + # The examples will go round-robin to egs_list. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + $get_egs_program \ + $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" "$targets" \ + ark:- \| \ + nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
+ output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)"
+ for x in $(seq $num_archives_intermediate); do
+ for y in $(seq $archives_multiple); do
+ archive_index=$[($x-1)*$archives_multiple+$y]
+ # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
+ ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1
+ done
+ done
+ $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
+ nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \
+ nnet3-copy-egs ark:- $output_archives || exit 1;
+ fi
+
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: removing temporary archives"
+ for x in $(seq $nj); do
+ for y in $(seq $num_archives_intermediate); do
+ file=$dir/egs_orig.$x.$y.ark
+ [ -L $file ] && rm $(readlink -f $file)
+ rm $file
+ done
+ done
+ if [ $archives_multiple -gt 1 ]; then
+ # there are some extra soft links that we should delete.
+ for f in $dir/egs.*.*.ark; do rm $f; done
+ fi
+ echo "$0: removing temporary files"
+ # Ignore errors below because trans.* might not exist.
+ rm -f $dir/trans.{ark,scp} $dir/targets.*.scp 2>/dev/null
+fi
+
+echo "$0: Finished preparing training examples"
+
diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py
index 17b8bea228d..9c2c641b0e9 100755
--- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py
+++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py
@@ -8,7 +8,133 @@ import copy
 import imp
-nodes = imp.load_source('', 'steps/nnet3/components.py')
+nodes = imp.load_source('nodes', 'steps/nnet3/components.py')
+nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py')
+chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py')
+
+def GetArgs():
+ # we add compulsory arguments as named arguments for readability
+ parser = argparse.ArgumentParser(description="Writes config files and variables "
+ "for LSTM creation and training",
+ epilog="See steps/nnet3/lstm/train.sh for example.")
+
+ # Only one of these arguments can be specified, and one of them has to
+ # be compulsorily specified
+ feat_group = parser.add_mutually_exclusive_group(required = True)
+ feat_group.add_argument("--feat-dim", type=int,
+ help="Raw feature dimension, e.g. 13")
+ feat_group.add_argument("--feat-dir", type=str,
+ help="Feature directory, from which we derive the feat-dim")
+
+ # only one of these arguments can be specified
+ ivector_group = parser.add_mutually_exclusive_group(required = False)
+ ivector_group.add_argument("--ivector-dim", type=int,
+ help="iVector dimension, e.g. 100", default=0)
+ ivector_group.add_argument("--ivector-dir", type=str,
+ help="iVector dir, which will be used to derive the ivector-dim ", default=None)
+
+ num_target_group = parser.add_mutually_exclusive_group(required = True)
+ num_target_group.add_argument("--num-targets", type=int,
+ help="number of network targets (e.g. num-pdf-ids/num-leaves)")
+ num_target_group.add_argument("--ali-dir", type=str,
+ help="alignment directory, from which we derive the num-targets")
+ num_target_group.add_argument("--tree-dir", type=str,
+ help="directory with final.mdl, from which we derive the num-targets")
+
+ # General neural network options
+ parser.add_argument("--splice-indexes", type=str,
+ help="Splice indexes at input layer, e.g. 
'-3,-2,-1,0,1,2,3'", required = True, default="0") + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + + # LSTM options + parser.add_argument("--num-lstm-layers", type=int, + help="Number of LSTM layers to be stacked", default=1) + parser.add_argument("--cell-dim", type=int, + help="dimension of lstm-cell") + parser.add_argument("--recurrent-projection-dim", type=int, + help="dimension of recurrent projection") + parser.add_argument("--non-recurrent-projection-dim", type=int, + help="dimension of non-recurrent projection") + parser.add_argument("--hidden-dim", type=int, + help="dimension of fully-connected layers") + + # Natural gradient options + parser.add_argument("--ng-per-element-scale-options", type=str, + help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") + parser.add_argument("--ng-affine-options", type=str, + help="options to be supplied to NaturalGradientAffineComponent", default="") + + # Gradient clipper options + parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + # Delay options + parser.add_argument("--label-delay", type=int, default=None, + help="option to delay the labels to make the lstm robust") + + parser.add_argument("--lstm-delay", type=str, default=None, + help="option to have different delays in recurrence for each lstm") + + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.num_lstm_layers < 1): + sys.exit("--num-lstm-layers has to be a positive integer") + if (args.clipping_threshold < 0): + sys.exit("--clipping-threshold has to be a non-negative") + if args.lstm_delay is None: + args.lstm_delay = [[-1]] * args.num_lstm_layers + else: + try: + args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) + except ValueError: + sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) + if len(args.lstm_delay) != args.num_lstm_layers: + sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") + + return args def PrintConfig(file_name, config_lines): f = open(file_name, 'w') @@ -77,143 +203,60 @@ def ParseLstmDelayString(lstm_delay): return lstm_delay_array - -if __name__ == "__main__": - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", - epilog="See steps/nnet3/lstm/train.sh for example.") - # General neural network options - parser.add_argument("--splice-indexes", type=str, - help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3' [compulsary argument]", default="0") - parser.add_argument("--feat-dim", type=int, - help="Raw feature dimension, e.g. 13") - parser.add_argument("--ivector-dim", type=int, - help="iVector dimension, e.g. 100", default=0) - parser.add_argument("--include-log-softmax", type=str, - help="add the final softmax layer ", default="true", choices = ["false", "true"]) - # LSTM options - parser.add_argument("--num-lstm-layers", type=int, - help="Number of LSTM layers to be stacked", default=1) - parser.add_argument("--cell-dim", type=int, - help="dimension of lstm-cell") - parser.add_argument("--recurrent-projection-dim", type=int, - help="dimension of recurrent projection") - parser.add_argument("--non-recurrent-projection-dim", type=int, - help="dimension of non-recurrent projection") - parser.add_argument("--hidden-dim", type=int, - help="dimension of fully-connected layers") - - # Natural gradient options - parser.add_argument("--ng-per-element-scale-options", type=str, - help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") - parser.add_argument("--ng-affine-options", type=str, - help="options to be supplied to NaturalGradientAffineComponent", default="") - - # Gradient clipper options - parser.add_argument("--norm-based-clipping", type=str, - help="use norm based clipping in ClipGradient components ", default="false", choices = ["false", "true"]) - parser.add_argument("--clipping-threshold", type=float, - help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=15) - - parser.add_argument("--num-targets", type=int, - help="number of network targets (e.g. 
num-pdf-ids/num-leaves)") - parser.add_argument("config_dir", - help="Directory to write config files and variables") - - # Delay options - parser.add_argument("--label-delay", type=int, default=None, - help="option to delay the labels to make the lstm robust") - - parser.add_argument("--lstm-delay", type=str, default=None, - help="option to have different delays in recurrence for each lstm") - - - - print(' '.join(sys.argv)) - - args = parser.parse_args() - - if not os.path.exists(args.config_dir): - os.makedirs(args.config_dir) - - ## Check arguments. - if args.splice_indexes is None: - sys.exit("--splice-indexes argument is required") - if args.feat_dim is None or not (args.feat_dim > 0): - sys.exit("--feat-dim argument is required") - if args.num_targets is None or not (args.num_targets > 0): - sys.exit("--feat-dim argument is required") - if (args.num_lstm_layers < 1): - sys.exit("--num-lstm-layers has to be a positive integer") - if (args.clipping_threshold < 0): - sys.exit("--clipping-threshold has to be a non-negative") - if args.lstm_delay is None: - lstm_delay = [-1] * args.num_lstm_layers - else: - try: - lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) - except ValueError: - sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) - if len(lstm_delay) != args.num_lstm_layers: - sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") - - parsed_splice_output = ParseSpliceString(args.splice_indexes.strip(), args.label_delay) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] - num_hidden_layers = parsed_splice_output['num_hidden_layers'] - splice_indexes = parsed_splice_output['splice_indexes'] - - if (num_hidden_layers < args.num_lstm_layers): - sys.exit("--num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(args.config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, + splice_indexes, lstm_delay, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + num_lstm_layers, num_hidden_layers, + norm_based_clipping, clipping_threshold, + ng_per_element_scale_options, ng_affine_options, + label_delay, include_log_softmax, xent_regularize, self_repair_scale): config_lines = {'components':[], 'component-nodes':[]} config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputNode(init_config_lines, prev_layer_output) - config_files[args.config_dir + '/init.config'] = init_config_lines + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output 
= nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') - for i in range(args.num_lstm_layers): + for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # BLSTM layer case, add both forward and backward - prev_layer_output1 = nodes.AddLstmLayer(config_lines, "BLstm{0}_forward".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim, - args.clipping_threshold, args.norm_based_clipping, - args.ng_per_element_scale_options, args.ng_affine_options, - lstm_delay = lstm_delay[i][0]) - prev_layer_output2 = nodes.AddLstmLayer(config_lines, "BLstm{0}_backward".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim, - args.clipping_threshold, args.norm_based_clipping, - args.ng_per_element_scale_options, args.ng_affine_options, - lstm_delay = lstm_delay[i][1]) + prev_layer_output1 = nodes.AddLstmLayer(config_lines, "BLstm{0}_forward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) + prev_layer_output2 = nodes.AddLstmLayer(config_lines, "BLstm{0}_backward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][1], self_repair_scale = self_repair_scale) prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output1['descriptor'], prev_layer_output2['descriptor']) prev_layer_output['dimension'] = prev_layer_output1['dimension'] + prev_layer_output2['dimension'] else: # LSTM layer case - prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim, - args.clipping_threshold, args.norm_based_clipping, - args.ng_per_element_scale_options, args.ng_affine_options, - lstm_delay = lstm_delay[i][0]) + prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, args.ng_affine_options, args.label_delay, args.include_log_softmax) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} if len(lstm_delay[i]) == 2: # since the form 'Append(Append(xx, yy), zz)' is not allowed, here we don't wrap the descriptor with 'Append()' so that we would have the form @@ -223,17 +266,65 @@ def 
ParseLstmDelayString(lstm_delay): if len(lstm_delay[i]) == 2: # since there is no 'Append' in 'AffRelNormLayer', here we wrap the descriptor with 'Append()' prev_layer_output['descriptor'] = 'Append({0})'.format(prev_layer_output['descriptor']) - for i in range(args.num_lstm_layers, num_hidden_layers): + for i in range(num_lstm_layers, num_hidden_layers): prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), - prev_layer_output, args.hidden_dim, - args.ng_affine_options) + prev_layer_output, hidden_dim, + ng_affine_options, self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets, args.ng_affine_options, args.label_delay, args.include_log_softmax) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} # printing out the configs # init.config used to train lda-mllt train for key in config_files.keys(): PrintConfig(key, config_files[key]) + + + + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers): + parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + if (num_hidden_layers < num_lstm_layers): + raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + # print('initial_right_context=' + str(splice_array[0][-1]), file=f) + f.close() + + return [left_context, right_context, num_hidden_layers, splice_indexes] + + +def Main(): + args = GetArgs() + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + + MakeConfigs(args.config_dir, + args.feat_dim, args.ivector_dim, args.num_targets, + splice_indexes, args.lstm_delay, args.cell_dim, + args.recurrent_projection_dim, args.non_recurrent_projection_dim, + args.num_lstm_layers, num_hidden_layers, + args.norm_based_clipping, + args.clipping_threshold, + args.ng_per_element_scale_options, args.ng_affine_options, + args.label_delay, args.include_log_softmax, args.xent_regularize, + args.self_repair_scale) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/lstm/train.sh b/egs/wsj/s5/steps/nnet3/lstm/train.sh index 10f6f793079..1717ea7b431 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/train.sh +++ b/egs/wsj/s5/steps/nnet3/lstm/train.sh @@ -560,10 +560,13 @@ while [ $x -lt $num_iters ]; do cur_num_hidden_layers=$[1+$x/$add_layers_period] 
config=$dir/configs/layer$cur_num_hidden_layers.config raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_read_opt="" # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_read_opt="--read-cache=$dir/cache.$x" fi if $do_average; then this_num_chunk_per_minibatch=$num_chunk_per_minibatch @@ -593,8 +596,15 @@ while [ $x -lt $num_iters ]; do k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + if [ $n -eq 1 ]; then + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt=" --write-cache=$dir/cache.$[$x+1]" + else + cache_write_opt="" + fi $cmd $train_queue_opt $dir/log/train.$x.$n.log \ - nnet3-train $parallel_train_opts --print-interval=10 --momentum=$momentum \ + nnet3-train $parallel_train_opts $cache_read_opt $cache_write_opt --print-interval=10 --momentum=$momentum \ --max-param-change=$max_param_change \ --optimization.min-deriv-time=$min_deriv_time "$raw" \ "ark:nnet3-copy-egs $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_num_chunk_per_minibatch --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ @@ -641,6 +651,7 @@ while [ $x -lt $num_iters ]; do rm $dir/$[$x-1].mdl fi fi + rm $dir/cache.$x 2>/dev/null x=$[$x+1] num_archives_processed=$[$num_archives_processed+$this_num_jobs] done @@ -661,9 +672,6 @@ if [ $stage -le $num_iters ]; then nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; done - # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, - # as if there are many models it can give out-of-memory error; and we set - # num-threads to 8 to speed it up (this isn't ideal...) combine_num_chunk_per_minibatch=$(python -c "print int(1024.0/($chunk_width))") $cmd $combine_queue_opt $dir/log/combine.log \ nnet3-combine --num-iters=40 \ diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py new file mode 100755 index 00000000000..af6afcb99e3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python + +# tdnn or RNN with 'jesus layer' + +# inputs to jesus layer: +# - for each spliced version of the previous layer the output (of dim --jesus-forward-output-dim) + +# outputs of jesus layer: +# for all layers: +# --jesus-forward-output-dim + + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings +import imp + +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/train_tdnn.sh for example."); +parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice[:recurrence] indexes at each hidden layer, e.g. 
'-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'. " + "Note: recurrence indexes are optional, may not appear in 1st layer, and must be " + "either all negative or all positive for any given layer.") + +# Only one of these arguments can be specified, and one of them has to +# be compulsarily specified +feat_group = parser.add_mutually_exclusive_group(required = True) +feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") +feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + +# only one of these arguments can be specified +ivector_group = parser.add_mutually_exclusive_group(required = False) +ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) +ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + +num_target_group = parser.add_mutually_exclusive_group(required = True) +num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") +num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") +num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) +parser.add_argument("--xent-separate-forward-affine", type=str, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default="false", choices = ["false", "true"]) +parser.add_argument("--use-repeated-affine", type=str, + help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)", + default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-learning-rate-factor", type=float, + help="Learning-rate factor for final affine component", + default=1.0) +parser.add_argument("--self-repair-scale", type=float, + help="Small scale involved in fixing derivatives, if supplied (e.g. try 0.00001)", + default=0.0) +parser.add_argument("--jesus-hidden-dim", type=int, + help="hidden dimension of Jesus layer.", default=10000) +parser.add_argument("--jesus-forward-output-dim", type=int, + help="part of output dimension of Jesus layer that goes to next layer", + default=1000) +parser.add_argument("--jesus-forward-input-dim", type=int, + help="Input dimension of Jesus layer that comes from affine projection " + "from the previous layer (same as output dim of forward affine transform)", + default=1000) +parser.add_argument("--final-hidden-dim", type=int, + help="Final hidden layer dimension-- or if <0, the same as " + "--jesus-forward-input-dim", default=-1) +parser.add_argument("--num-jesus-blocks", type=int, + help="number of blocks in Jesus layer. 
All configs of the form " + "--jesus-*-dim will be rounded up to be a multiple of this.", + default=100); +parser.add_argument("--jesus-stddev-scale", type=float, + help="Scaling factor on parameter stddev of Jesus layer (smaller->jesus layer learns faster)", + default=1.0) +parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components (only relevant if " + "recurrence indexes are specified). If clipping-threshold=0 no clipping is done", + default=15) +parser.add_argument("config_dir", + help="Directory to write config files and variables"); + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + +## Check arguments. +if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + +if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) +elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + +if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + +if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + +if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + +if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + +## Check arguments. +if args.num_jesus_blocks < 1: + sys.exit("invalid --num-jesus-blocks value"); +if args.final_hidden_dim < 0: + args.final_hidden_dim = args.jesus_forward_input_dim + +for name in [ "jesus_hidden_dim", "jesus_forward_output_dim", "jesus_forward_input_dim", + "final_hidden_dim" ]: + old_val = getattr(args, name) + if old_val % args.num_jesus_blocks != 0: + new_val = old_val + args.num_jesus_blocks - (old_val % args.num_jesus_blocks) + printable_name = '--' + name.replace('_', '-') + print('Rounding up {0} from {1} to {2} to be a multiple of --num-jesus-blocks={3} '.format( + printable_name, old_val, new_val, args.num_jesus_blocks)) + setattr(args, name, new_val); + +# this is a bit like a struct, initialized from a string, which describes how to +# set up the statistics-pooling and statistics-extraction components. +# An example string is 'mean(-99:3:9::99)', which means, compute the mean of +# data within a window of -99 to +99, with distinct means computed every 9 frames +# (we round to get the appropriate one), and with the input extracted on multiples +# of 3 frames (so this will force the input to this layer to be evaluated +# every 3 frames). Another example string is 'mean+stddev(-99:3:9:99)', +# which will also cause the standard deviation to be computed. +class StatisticsConfig: + # e.g. 
c = StatisticsConfig('mean+stddev(-99:3:9:99)', 400, 'jesus1-forward-output-affine') + def __init__(self, config_string, input_dim, input_name): + self.input_dim = input_dim + self.input_name = input_name + + m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + config_string) + if m == None: + sys.exit("Invalid splice-index or statistics-config string: " + config_string) + self.output_stddev = (m.group(1) != 'mean') + self.left_context = -int(m.group(2)) + self.input_period = int(m.group(3)) + self.stats_period = int(m.group(4)) + self.right_context = int(m.group(5)) + if not (self.left_context > 0 and self.right_context > 0 and + self.input_period > 0 and self.stats_period > 0 and + self.left_context % self.stats_period == 0 and + self.right_context % self.stats_period == 0 and + self.stats_period % self.input_period == 0): + sys.exit("Invalid configuration of statistics-extraction: " + config_string) + + # OutputDim() returns the output dimension of the node that this produces. + def OutputDim(self): + return self.input_dim * (2 if self.output_stddev else 1) + + # OutputDims() returns an array of output dimensions, consisting of + # [ input-dim ] if just "mean" was specified, otherwise + # [ input-dim input-dim ] + def OutputDims(self): + return [ self.input_dim, self.input_dim ] if self.output_stddev else [ self.input_dim ] + + # Descriptor() returns the textual form of the descriptor by which the + # output of this node is to be accessed. + def Descriptor(self): + return 'Round({0}-pooling-{1}-{2}, {3})'.format(self.input_name, self.left_context, self.right_context, + self.stats_period) + + # This function writes the configuration lines need to compute the specified + # statistics, to the file f. + def WriteConfigs(self, f): + print('component name={0}-extraction-{1}-{2} type=StatisticsExtractionComponent input-dim={3} ' + 'input-period={4} output-period={5} include-variance={6} '.format( + self.input_name, self.left_context, self.right_context, + self.input_dim, self.input_period, self.stats_period, + ('true' if self.output_stddev else 'false')), file=f) + print('component-node name={0}-extraction-{1}-{2} component={0}-extraction-{1}-{2} input={0} '.format( + self.input_name, self.left_context, self.right_context), file=f) + stats_dim = 1 + self.input_dim * (2 if self.output_stddev else 1) + print('component name={0}-pooling-{1}-{2} type=StatisticsPoolingComponent input-dim={3} ' + 'input-period={4} left-context={1} right-context={2} num-log-count-features=0 ' + 'output-stddevs={5} '.format(self.input_name, self.left_context, self.right_context, + stats_dim, self.stats_period, + ('true' if self.output_stddev else 'false')), + file=f) + print('component-node name={0}-pooling-{1}-{2} component={0}-pooling-{1}-{2} input={0}-extraction-{1}-{2} '.format( + self.input_name, self.left_context, self.right_context), file=f) + + + + +## Work out splice_array +## e.g. for +## args.splice_indexes == '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3' +## we would have +## splice_array = [ [ -3,-2,...3 ], [-3,0] [-3,0] [-6,-3,0] + + +splice_array = [] +left_context = 0 +right_context = 0 +split_on_spaces = args.splice_indexes.split(" "); # we already checked the string is nonempty. 
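+# Each space-separated entry configures one layer; entries that do not parse as
+# integer offsets must be valid statistics-config specifiers such as
+# 'mean+stddev(-99:3:9:99)', and these are not allowed in the first layer.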
+if len(split_on_spaces) < 2: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) +try: + for string in split_on_spaces: + this_layer = len(splice_array) + + this_splices = string.split(",") + splice_array.append(this_splices) + # the rest of this block updates left_context and right_context, and + # does some checking. + leftmost_splice = 10000 + rightmost_splice = -10000 + for s in this_splices: + try: + n = int(s) + if n < leftmost_splice: + leftmost_splice = n + if n > rightmost_splice: + rightmost_splice = n + except: + if len(splice_array) == 1: + sys.exit("First dimension of splicing array must not have averaging [yet]") + try: + x = StatisticsConfig(s, 100, 'foo') + except: + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics: " + s) + + if leftmost_splice == 10000 or rightmost_splice == -10000: + sys.exit("invalid element of --splice-indexes: " + string) + left_context += -leftmost_splice + right_context += rightmost_splice +except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + " " + str(e)) +left_context = max(0, left_context) +right_context = max(0, right_context) +num_hidden_layers = len(splice_array) +input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + +f = open(args.config_dir + "/vars", "w") +print('left_context=' + str(left_context), file=f) +print('right_context=' + str(right_context), file=f) +print('num_hidden_layers=' + str(num_hidden_layers), file=f) +f.close() + + +f = open(args.config_dir + "/init.config", "w") +print('# Config file for initializing neural network prior to', file=f) +print('# preconditioning matrix computation', file=f) +print('input-node name=input dim=' + str(args.feat_dim), file=f) +list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ] +if args.ivector_dim > 0: + print('input-node name=ivector dim=' + str(args.ivector_dim), file=f) + list.append('ReplaceIndex(ivector, t, 0)') +# example of next line: +# output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))" +print('output-node name=output input=Append({0})'.format(", ".join(list)), file=f) +f.close() + + +for l in range(1, num_hidden_layers + 1): + # the following summarizes the structure of the layers: Here, the Jesus component includes ReLU at its input and output, and renormalize + # at its output after the ReLU. + # layer1: splice + LDA-transform + affine + ReLU + renormalize + # layerX: splice + Jesus + affine + ReLU + + # Inside the jesus component is: + # [permute +] ReLU + repeated-affine + ReLU + repeated-affine + # [we make the repeated-affine the last one so we don't have to redo that in backprop]. + # We follow this with a post-jesus composite component containing the operations: + # [permute +] ReLU + renormalize + # call this post-jesusN. + # After this we use dim-range nodes to split up the output into + # [ jesusN-forward-output, jesusN-direct-output and jesusN-projected-output ] + # parts; + # and nodes for the jesusN-forward-affine. + + f = open(args.config_dir + "/layer{0}.config".format(l), "w") + print('# Config file for layer {0} of the network'.format(l), file=f) + if l == 1: + print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'. 
+ format(args.config_dir), file=f) + splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] + if args.ivector_dim > 0: splices.append('ReplaceIndex(ivector, t, 0)') + orig_input='Append({0})'.format(', '.join(splices)) + # e.g. orig_input = 'Append(Offset(input, -2), ... Offset(input, 2), ivector)' + print('component-node name=lda component=lda input={0}'.format(orig_input), + file=f) + # after the initial LDA transform, put a trainable affine layer and a ReLU, followed + # by a NormalizeComponent. + print('component name=affine1 type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} bias-stddev=0'.format( + input_dim, args.jesus_forward_input_dim), file=f) + print('component-node name=affine1 component=affine1 input=lda', + file=f) + # the ReLU after the affine + print('component name=relu1 type=RectifiedLinearComponent dim={1} self-repair-scale={2}'.format( + l, args.jesus_forward_input_dim, args.self_repair_scale), file=f) + print('component-node name=relu1 component=relu1 input=affine1', file=f) + # the renormalize component after the ReLU + print ('component name=renorm1 type=NormalizeComponent dim={0} '.format( + args.jesus_forward_input_dim), file=f) + print('component-node name=renorm1 component=renorm1 input=relu1', file=f) + cur_output = 'renorm1' + cur_affine_output_dim = args.jesus_forward_input_dim + else: + splices = [] + spliced_dims = [] + for s in splice_array[l-1]: + # the connection from the previous layer + try: + offset = int(s) + # it's an integer offset. + splices.append('Offset({0}, {1})'.format(cur_output, offset)) + spliced_dims.append(cur_affine_output_dim) + except: + # it's not an integer offset, so assume it specifies the + # statistics-extraction. + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) + + # get the input to the Jesus layer. + cur_input = 'Append({0})'.format(', '.join(splices)) + cur_dim = sum(spliced_dims) + + this_jesus_output_dim = args.jesus_forward_output_dim + + # As input to the Jesus component we'll append the spliced input and any + # mean/stddev-stats input, and the first thing inside the component that + # we do is rearrange the dimensions so that things pertaining to a + # particular block stay together. + + column_map = [] + for x in range(0, args.num_jesus_blocks): + dim_offset = 0 + for src_splice in spliced_dims: + src_block_size = src_splice / args.num_jesus_blocks + for y in range(0, src_block_size): + column_map.append(dim_offset + (x * src_block_size) + y) + dim_offset += src_splice + if sorted(column_map) != range(0, sum(spliced_dims)): + print("column_map is " + str(column_map)) + print("num_jesus_blocks is " + str(args.num_jesus_blocks)) + print("spliced_dims is " + str(spliced_dims)) + sys.exit("code error creating new column order") + + need_input_permute_component = (column_map != range(0, sum(spliced_dims))) + + # Now add the jesus component. + + permute_offset = (1 if need_input_permute_component else 0) + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + num_sub_components = 4 + permute_offset + hidden_else_output_dim = args.jesus_hidden_dim + else: # no hidden part in jesus layer. 
+ num_sub_components = 2 + permute_offset + hidden_else_output_dim = args.jesus_forward_output_dim + print('component name=jesus{0} type=CompositeComponent num-components={1}'.format( + l, num_sub_components), file=f, end='') + # print the sub-components of the CompositeComopnent on the same line. + # this CompositeComponent has the same effect as a sequence of + # components, but saves memory. + if need_input_permute_component: + print(" component1='type=PermuteComponent column-map={1}'".format( + l, ','.join([str(x) for x in column_map])), file=f, end='') + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 1 + permute_offset, + cur_dim, args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks)), + file=f, end='') + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 3 + permute_offset, hidden_else_output_dim, + args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(args.jesus_hidden_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt((args.jesus_hidden_dim / args.num_jesus_blocks))), + file=f, end='') + + print("", file=f) # print newline. + print('component-node name=jesus{0} component=jesus{0} input={1}'.format( + l, cur_input), file=f) + + # now print the post-Jesus component which consists of ReLU + + # renormalize. + + num_sub_components = 2 + print('component name=post-jesus{0} type=CompositeComponent num-components=2'.format(l), + file=f, end='') + + # still within the post-Jesus component, print the ReLU + print(" component1='type=RectifiedLinearComponent dim={0} self-repair-scale={1}'".format( + this_jesus_output_dim, args.self_repair_scale), file=f, end='') + # still within the post-Jesus component, print the NormalizeComponent + print(" component2='type=NormalizeComponent dim={0} '".format( + this_jesus_output_dim), file=f, end='') + print("", file=f) # print newline. 
+ print('component-node name=post-jesus{0} component=post-jesus{0} input=jesus{0}'.format(l), + file=f) + + # handle the forward output, we need an affine node for this: + cur_affine_output_dim = (args.jesus_forward_input_dim if l < num_hidden_layers else args.final_hidden_dim) + print('component name=forward-affine{0} type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. + format(l, args.jesus_forward_output_dim, cur_affine_output_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine component=forward-affine{0} input=post-jesus{0}'.format( + l), file=f) + # for each recurrence delay, create an affine node followed by a + # clip-gradient node. [if there are multiple recurrences in the same layer, + # each one gets its own affine projection.] + + # The reason we set the param-stddev to 0 is out of concern that if we + # initialize to nonzero, this will encourage the corresponding inputs at + # the jesus layer to become small (to remove this random input), which + # in turn will make this component learn slowly (due to small + # derivatives). we set the bias-mean to 0.001 so that the ReLUs on the + # input of the Jesus layer are in the part of the activation that has a + # nonzero derivative- otherwise with this setup it would never learn. + + cur_output = 'jesus{0}-forward-output-affine'.format(l) + + + # with each new layer we regenerate the final-affine component, with a ReLU before it + # because the layers we printed don't end with a nonlinearity. + print('component name=final-relu type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + cur_affine_output_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu component=final-relu input={0}'.format(cur_output), + file=f) + print('component name=final-affine type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} learning-rate-factor={2} param-stddev=0.0 bias-stddev=0'.format( + cur_affine_output_dim, args.num_targets, + args.final_layer_learning_rate_factor), file=f) + print('component-node name=final-affine component=final-affine input=final-relu', + file=f) + # printing out the next two, and their component-nodes, for l > 1 is not + # really necessary as they will already exist, but it doesn't hurt and makes + # the structure clearer. + if args.include_log_softmax == "true": + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) + + if args.xent_regularize != 0.0: + xent_input = 'final-relu' + if l == num_hidden_layers and args.xent_separate_forward_affine == "true": + print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. 
+ format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format( + l), file=f) + print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + args.final_hidden_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu-xent component=final-relu-xent ' + 'input=jesus{0}-forward-output-affine-xent'.format(l), file=f) + xent_input = 'final-relu-xent' + + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + print('component name=final-affine-xent type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format( + cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f) + print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format( + xent_input), file=f) + print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax-xent component=final-log-softmax-xent ' + 'input=final-affine-xent', file=f) + print('output-node name=output-xent input=final-log-softmax-xent', file=f) + + f.close() diff --git a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py index 12c7a26e46d..8403c273a9d 100644 --- a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py @@ -70,7 +70,7 @@ splice_array = [] left_context = 0 right_context = 0 -split1 = args.splice_indexes.split(" "); # we already checked the string is nonempty. +split1 = args.splice_indexes.split(); # we already checked the string is nonempty. if len(split1) < 1: sys.exit("invalid --splice-indexes argument, too short: " + args.splice_indexes) diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index 24666b8bd02..c36de8c16bf 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -1,11 +1,12 @@ #!/bin/bash # script showing use of nnet3_to_dot.py -# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). +# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). # Begin configuration section. component_attributes="name,type" node_prefixes="" +info_bin=nnet3-am-info echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. @@ -20,7 +21,7 @@ if [ $# != 3 ]; then echo " --node-prefixes # list of prefixes. 
Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" - + exit 1; fi @@ -29,10 +30,10 @@ dot_file=$2 output_file=$3 attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} -nnet3-am-info $model | \ +$info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ - $attr > $dot_file + $attr $dot_file command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } -dot -Tpng $dot_file -o $output_file +dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py new file mode 100644 index 00000000000..166a6b85be2 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -0,0 +1,658 @@ +import subprocess +import logging +import math +import re +import time +import argparse + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. """ + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +def CheckIfCudaCompiled(): + p = subprocess.Popen("cuda-compiled") + p.communicate() + if p.returncode == 1: + return False + else: + return True + +def RunKaldiCommand(command, wait = True): + """ Runs commands frequently seen in Kaldi scripts. 
These are usually a
+    sequence of commands connected by pipes, so we use shell=True """
+    #logger.info("Running the command\n{0}".format(command))
+    p = subprocess.Popen(command, shell = True,
+                         stdout = subprocess.PIPE,
+                         stderr = subprocess.PIPE)
+
+    if wait:
+        [stdout, stderr] = p.communicate()
+        if p.returncode != 0:
+            raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr)
+        return stdout, stderr
+    else:
+        return p
+
+def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0):
+    assert(num_models > 0)
+
+    parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames")
+    objf = []
+    for i in range(num_models):
+        model_num = i + 1
+        logfile = re.sub('%', str(model_num), log_file_pattern)
+        lines = open(logfile, 'r').readlines()
+        this_objf = -100000
+        for line_num in range(1, len(lines) + 1):
+            # we search from the end, as this results in fewer regex searches
+            # (Python regex is slow!)
+            mat_obj = parse_regex.search(lines[-1*line_num])
+            if mat_obj is not None:
+                this_objf = float(mat_obj.groups()[0])
+                break
+        objf.append(this_objf)
+    max_index = objf.index(max(objf))
+    accepted_models = []
+    for i in range(num_models):
+        if (objf[max_index] - objf[i]) <= difference_threshold:
+            accepted_models.append(i+1)
+
+    if len(accepted_models) != num_models:
+        logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern))
+
+    return [accepted_models, max_index+1]
+
+def GetNumberOfLeaves(alidir):
+    [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir))
+    parts = stdout.split()
+    assert(parts[0] == "num-pdfs")
+    num_leaves = int(parts[1])
+    if num_leaves == 0:
+        raise Exception("Number of leaves is 0")
+    return num_leaves
+
+def GetNumberOfJobs(alidir):
+    try:
+        num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip())
+    except (IOError, ValueError):
+        raise Exception('Exception while reading the number of alignment jobs')
+    return num_jobs
+
+def GetIvectorDim(ivector_dir = None):
+    if ivector_dir is None:
+        return 0
+    [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{dir}/ivector_online.scp -".format(dir = ivector_dir))
+    ivector_dim = int(stdout_val)
+    return ivector_dim
+
+def GetFeatDim(feat_dir):
+    [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{data}/feats.scp -".format(data = feat_dir))
+    feat_dim = int(stdout_val)
+    return feat_dim
+
+def ReadKaldiMatrix(matrix_file):
+    try:
+        lines = map(lambda x: x.split(), open(matrix_file).readlines())
+        first_field = lines[0][0]
+        last_field = lines[-1][-1]
+        lines[0] = lines[0][1:]
+        lines[-1] = lines[-1][:-1]
+        if not (first_field == "[" and last_field == "]"):
+            raise Exception("Kaldi matrix file has incorrect format; only text-format matrix files can be read by this script")
+        for i in range(len(lines)):
+            lines[i] = map(lambda x: int(float(x)), lines[i])
+        return lines
+    except IOError:
+        raise Exception("Error while reading the kaldi matrix file {0}".format(matrix_file))
+
+def WriteKaldiMatrix(output_file, matrix):
+    # matrix is a list of lists
+    file = open(output_file, 'w')
+    file.write("[ ")
+    num_rows = len(matrix)
+    if num_rows == 0:
+        raise Exception("Matrix is empty")
+    num_cols = len(matrix[0])
+
+    for row_index in range(len(matrix)):
+        if num_cols != 
len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to have the same length") + file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file.write("\n") + file.write(" ]") + file.close() + +import shutil +def CopyEgsPropertiesToExpDir(egs_dir, dir): + try: + for file in ['cmvn_opts', 'splice_opts', 'final.mat']: + file_name = '{dir}/{file}'.format(dir = egs_dir, file = file) + if os.path.isfile(file_name): + shutil.copy2(file_name, dir) + except IOError: + raise Exception("Error while trying to copy egs property files to {dir}".format(dir = dir)) + +def SplitData(data, num_jobs): + RunKaldiCommand("utils/split_data.sh {data} {num_jobs}".format(data = data, + num_jobs = num_jobs)) + +def ParseModelConfigVarsFile(var_file): + try: + var_file_handle = open(var_file, 'r') + model_left_context = None + model_right_context = None + num_hidden_layers = None + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1] + if field_name in ['model_left_context', 'left_context']: + model_left_context = int(field_value) + elif field_name in ['model_right_context', 'right_context']: + model_right_context = int(field_value) + elif field_name == 'num_hidden_layers': + num_hidden_layers = int(field_value) + + if model_left_context is not None and model_right_context is not None and num_hidden_layers is not None: + return [model_left_context, model_right_context, num_hidden_layers] + + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) + + +def GenerateEgs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + samples_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + RunKaldiCommand(""" +steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {alidir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, data = data, alidir = alidir, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): + try: + egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format(egs_dir)).readline()) + egs_left_context = int(open('{0}/info/left_context'.format(egs_dir)).readline()) + egs_right_context = int(open('{0}/info/right_context'.format(egs_dir)).readline()) + if (feat_dim != egs_feat_dim) or 
(ivector_dim != egs_ivector_dim):
+            raise Exception('There is a mismatch between the feat-dim/ivector-dim of the current experiment and the provided egs directory')
+
+        if (egs_left_context < left_context) or (egs_right_context < right_context):
+            raise Exception('The egs have insufficient context')
+
+        frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline())
+        num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline())
+
+        return [egs_left_context, egs_right_context, frames_per_eg, num_archives]
+    except (IOError, ValueError):
+        raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir))
+
+def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts,
+                                 max_lda_jobs = None, rand_prune = 4.0,
+                                 lda_opts = None):
+    if max_lda_jobs is not None:
+        if num_lda_jobs > max_lda_jobs:
+            num_lda_jobs = max_lda_jobs
+
+    RunKaldiCommand("""
+{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \
+  nnet3-acc-lda-stats --rand-prune={rand_prune} \
+  {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format(
+        command = run_opts.command,
+        num_lda_jobs = num_lda_jobs,
+        dir = dir,
+        egs_dir = egs_dir,
+        rand_prune = rand_prune))
+
+    # the above command would have generated dir/{1..num_lda_jobs}.lda_stats
+    lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x),
+                         range(1, num_lda_jobs + 1))
+
+    RunKaldiCommand("""
+{command} {dir}/log/sum_transform_stats.log \
+  sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
+        command = run_opts.command,
+        dir = dir, lda_stat_files = " ".join(lda_stat_files)))
+
+    for file in lda_stat_files:
+        try:
+            os.remove(file)
+        except OSError:
+            raise Exception("There was an error while trying to remove the lda stat files.")
+    # This computes a fixed affine transform, in the way described in Appendix C.6
+    # of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant of an LDA
+    # transform but without dimensionality reduction.
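+    # estimate the transform from the summed stats and link it into the configs
+    # directory (see the ForceSymlink call below).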
+ + RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +import os, errno + +def ForceSymlink(file1, file2): + try: + os.symlink(file1, file2) + except OSError, e: + if e.errno == errno.EEXIST: + os.remove(file2) + os.symlink(file1, file2) + +def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, + presoftmax_prior_scale_power = None): + + # getting the raw pdf count + RunKaldiCommand(""" +{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ +ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ +post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- {dir}/pdf_counts.JOB + """.format(command = run_opts.command, + num_jobs = num_jobs, + dir = dir, + alidir = alidir)) + + RunKaldiCommand(""" +{command} {dir}/log/sum_pdf_counts.log \ +vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts + """.format(command = run_opts.command, dir = dir)) + + import glob + for file in glob.glob('{0}/pdf_counts.*'.format(dir)): + os.remove(file) + + smooth=0.01 + pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + total = sum(pdf_counts) + average_count = total/len(pdf_counts) + scales = [] + for i in range(len(pdf_counts)): + scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) + num_pdfs = len(pdf_counts) + scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + +def PrepareInitialAcousticModel(dir, alidir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # Convert to .mdl, train the transitions, set the priors. + RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) + +def VerifyIterations(num_iters, num_epochs, num_hidden_layers, + num_archives, max_models_combine, add_layers_period, + num_jobs_final): + """ Verifies that number of iterations are sufficient for various + phases of training.""" + + finish_add_layers_iter = num_hidden_layers * add_layers_period + + if num_iters <= (finish_add_layers_iter + 2): + raise Exception(' There are insufficient number of epochs. These are not even sufficient for layer-wise discriminatory training.') + + + approx_iters_per_epoch_final = num_archives/num_jobs_final + # First work out how many iterations we want to combine over in the final + # nnet3-combine-fast invocation. (We may end up subsampling from these if the + # number exceeds max_model_combine). 
The number we use is: + # min(max(max_models_combine, approx_iters_per_epoch_final), + # 1/2 * iters_after_last_layer_added) + half_iters_after_add_layers = (num_iters - finish_add_layers_iter)/2 + num_iters_combine = min(max(max_models_combine, approx_iters_per_epoch_final), half_iters_after_add_layers) + return num_iters_combine + +def GetRealignIters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): + """ Takes the realign_times string and identifies the approximate + iterations at which realignments have to be done.""" + # realign_times is a space seperated string of values between 0 and 1 + + realign_iters = [] + for realign_time in realign_times.split(): + realign_time = float(realign_time) + assert(realign_time > 0 and realign_time < 1) + if num_jobs_initial == num_jobs_final: + realign_iter = int(0.5 + num_iters * realign_time) + else: + realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, 2)) + realign_iter = realign_iter - num_jobs_initial + realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter * num_iters + realign_iters.append(int(realign_iter)) + + return realign_iters + +def Align(dir, data, lang, run_opts, iter = None, transform_dir = None, + online_ivector_dir = None): + + alidir = '{dir}/ali{ali_suffix}'.format(dir = dir, + ali_suffix = "_iter_{0}".format(iter) if iter is not None else "") + + logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( + gpu = " using gpu " if run_opts.realign_use_gpu else " ", + num_jobs = run_opts.realign_num_jobs )) + RunKaldiCommand(""" +steps/nnet3/align.sh --nj {num_jobs_align} --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir} + """.format(dir = dir, align_use_gpu = "yes" if run_opts.realign_use_gpu else "no", + align_cmd = run_opts.realign_command, + align_queue_opt = run_opts.realign_queue_opt, + num_jobs_align = run_opts.realign_num_jobs, + transform_dir = transform_dir if transform_dir is not None else "", + online_ivector_dir = online_ivector_dir if online_ivector_dir is not None else "", + iter = iter if iter is not None else "", + alidir = alidir, + lang = lang, data = data)) + return alidir + +def Realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, + prior_subset_size, num_archives, run_opts, + transform_dir = None, online_ivector_dir = None): + raise Exception("Realignment stage has not been implemented in nnet3") + logger.info("Getting average posterior for purposes of adjusting the priors.") + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
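+ # The intended flow below (currently unreachable because of the exception above) is: + # compute the average posterior over a subset of egs, adjust the model priors, re-align + # the data with the current model, and then relabel the existing egs with the new alignments.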
+ + avg_post_vec_file = ComputeAveragePosterior(dir, iter, prev_egs_dir, + num_archives, prior_subset_size, run_opts) + + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + logger.info("Re-adjusting priors based on computed posteriors") + model = '{0}/{1}.mdl'.format(dir, iter) + AdjustAmPriors(dir, model, avg_post_vec_file, model, run_opts) + + alidir = Align(dir, feat_dir, lang, run_opts, iter, + transform_dir, online_ivector_dir) + RunKaldiCommand(""" +steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} {alidir} \ + {prev_egs_dir} {cur_egs_dir}""".format( + command = run_opts.command, + iter = iter, + dir = dir, + alidir = alidir, + prev_egs_dir = prev_egs_dir, + cur_egs_dir = cur_egs_dir)) + +def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, + num_archives_to_process, + initial_effective_lrate, final_effective_lrate): + if iter + 1 >= num_iters: + effective_learning_rate = final_effective_lrate + else: + effective_learning_rate = initial_effective_lrate * math.exp(num_archives_processed * math.log(final_effective_lrate/ initial_effective_lrate)/num_archives_to_process) + + return num_jobs * effective_learning_rate + +def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): + + if iter == 0: + return True + + try: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + output = output.strip().split("\n") + # eg. + # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + + mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") + total_mean_deriv = 0 + num_derivs = 0 + for line in output: + mat_obj = mean_pattern.search(line) + if mat_obj is None: + raise Exception("Something went wrong, unable to find deriv-avg in the line \n{0}".format(line)) + mean_deriv = float(mat_obj.groups()[0]) + total_mean_deriv += mean_deriv + num_derivs += 1 + if total_mean_deriv / num_derivs < shrink_threshold: + return True + except ValueError: + raise Exception("Error while parsing the model info output") + + return False + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark:nnet3-merge-egs ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark:nnet3-merge-egs ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + +def ComputeProgress(dir, iter, egs_dir, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ +nnet3-show-progress --use-gpu=no 
"nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ +"ark:nnet3-merge-egs --minibatch-size=256 ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model, + egs_dir = egs_dir), wait = wait) + +def CombineModels(dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = None): + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + print num_iters_combine + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + + if chunk_width is not None: + # this is an RNN model + mbsize = int(1024.0/(chunk_width)) + else: + mbsize = 1024 + + RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} "ark:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + dir = dir, raw_models = " ".join(raw_model_strings), + mbsize = mbsize, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + +def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts): + # Note: this just uses CPUs, using a smallish subset of data. + """ Computes the average posterior of the network""" + import glob + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + + if run_opts.num_jobs_compute_prior > num_archives: + egs_part = 1 + else: + egs_part = 'JOB' + + RunKaldiCommand(""" +{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ +matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + """.format(command = run_opts.command, + dir = dir, + num_jobs_compute_prior = run_opts.num_jobs_compute_prior, + prior_queue_opt = run_opts.prior_queue_opt, + iter = iter, prior_subset_size = prior_subset_size, + egs_dir = egs_dir, egs_part = egs_part, + prior_gpu_opt = run_opts.prior_gpu_opt)) + + # make sure there is time for $dir/post.{iter}.*.vec to appear. 
+ + time.sleep(5) + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + RunKaldiCommand(""" +{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} + """.format(command = run_opts.command, + dir = dir, iter = iter, output_file = avg_post_vec_file)) + + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + return avg_post_vec_file + +def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): + RunKaldiCommand(""" +{command} {dir}/log/adjust_priors.final.log \ +nnet3-am-adjust-priors {input_model} {avg_posterior_vector} {output_model} + """.format(command = run_opts.command, + dir = dir, input_model = input_model, + avg_posterior_vector = avg_posterior_vector, + output_model = output_model)) + +def RemoveEgs(egs_dir): + RunKaldiCommand("steps/nnet2/remove_egs.sh {egs_dir}".format(egs_dir=egs_dir)) + +def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, + preserve_model_interval = 100, + remove_egs = True): + try: + if remove_egs: + RemoveEgs(egs_dir) + + for iter in range(num_iters): + RemoveModel(nnet_dir, iter, num_iters, 1, + preserve_model_interval) + except (IOError, OSError) as err: + logger.warning("Error while cleaning up the nnet directory") + raise err + +def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, + preserve_model_interval = 100): + if iter % preserve_model_interval == 0: + return + if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : + return + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if os.path.isfile(file_name): + os.remove(file_name) + diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py new file mode 100755 index 00000000000..5c64aab18f0 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + +import warnings +import imp +import argparse +import os +import errno +import logging +import re +import subprocess +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +try: + import matplotlib as mpl + mpl.use('Agg') + import matplotlib.pyplot as plt + from matplotlib.backends.backend_pdf import PdfPages + import numpy as np + + plot = True +except ImportError: + warnings.warn(""" +This script requires matplotlib and numpy. Please install them to generate plots. Proceeding with generation of tables. +If you are on a cluster where you do not have admin rights you could try using virtualenv.""") + # matplotlib/numpy are not available; fall back to generating only the text reports. + plot = False + +nlp = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Generating plots') + + + + +def GetArgs(): + parser = argparse.ArgumentParser(description=""" +Parses the training logs and generates a variety of plots. +example : steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 exp/nnet3/tdnn exp/nnet3/tdnn/report +""") + parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison.
These will only be used for plots, not tables") + parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) + parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Set this to true for chain models; log-probability plots are then generated instead of accuracy plots") + parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") + parser.add_argument("output_dir", help="output directory for the report, e.g. exp/nnet3/tdnn/report") + + args = parser.parse_args() + if args.comparison_dir is not None and len(args.comparison_dir) > 6: + raise Exception("A maximum of 6 --comparison-dir options can be specified. If you want to compare more directories, you would have to extend the plot_colors variable, which specifies the colors used for plotting.") + assert(args.start_iter >= 1) + return args + +plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan' ] + + + +class LatexReport: + def __init__(self, pdf_file): + self.pdf_file = pdf_file + self.document=[] + self.document.append(""" +\documentclass[prl,10pt,twocolumn]{revtex4} +\usepackage{graphicx} % Used to import the graphics +\\begin{document} +""") + + def AddFigure(self, figure_pdf, title): + # we will have to keep extending this replacement list based on errors during compilation + # escaping underscores in the title + title = "\\texttt{"+re.sub("_","\_", title)+"}" + fig_latex = """ +%... +\\begin{figure}[t] + \\begin{center} + \caption{""" + title + """} + \includegraphics[width=\\textwidth]{""" + figure_pdf + """} + \end{center} +\end{figure} +%... +""" + self.document.append(fig_latex) + + def Close(self): + self.document.append("\end{document}") + return self.Compile() + + def Compile(self): + root, ext = os.path.splitext(self.pdf_file) + dir_name = os.path.dirname(self.pdf_file) + latex_file = root + ".tex" + lat_file = open(latex_file, "w") + lat_file.write("\n".join(self.document)) + lat_file.close() + try: + proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc.communicate() + except Exception as e: + logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) + return False + return True + +def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + if plot: + fig = plt.figure() + plots = [] + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + for dir in dirs: + [accuracy_report, accuracy_times, accuracy_data] = nlp.GenerateAccuracyReport(dir, key) + if index == 0: + # this is the main experiment directory + acc_file = open("{0}/{1}.log".format(output_dir, file_basename), "w") + acc_file.write(accuracy_report) + acc_file.close() + + if plot: + color_val = plot_colors[index] + data = np.array(accuracy_data) + if data.shape[0] == 0: + raise Exception("Couldn't find any rows for the accuracy plot") + data = data[data[:,0]>=start_iter, :] + plot_handle, = plt.plot(data[:, 0], data[:, 1], color = color_val, linestyle = "--", label = "train {0}".format(dir)) + plots.append(plot_handle) + plot_handle, = plt.plot(data[:, 0], data[:, 2], color = color_val, label = "valid {0}".format(dir)) + plots.append(plot_handle) + index += 1 + if plot: + plt.xlabel('Iteration') + plt.ylabel(key) + lgd = plt.legend(handles=plots, loc='lower center',
bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1 ), ncol=1, borderaxespad=0.) + plt.grid(True) + fig.suptitle("{0} plot".format(key)) + figfile_name = '{0}/{1}.pdf'.format(output_dir, file_basename) + plt.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Plot of {0} vs iterations".format(key)) + +def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + stats_per_dir = {} + + for dir in dirs: + stats_per_component_per_iter = nlp.ParseProgressLogsForNonlinearityStats(dir) + stats_per_dir[dir] = stats_per_component_per_iter + + # convert the nonlin stats into tables + stat_tables_per_component_per_dir = {} + for dir in dirs: + stats_per_component_per_iter = stats_per_dir[dir] + component_names = stats_per_component_per_iter.keys() + stat_tables_per_component = {} + for component_name in component_names: + comp_data = stats_per_component_per_iter[component_name] + comp_type = comp_data['type'] + comp_stats = comp_data['stats'] + iters = comp_stats.keys() + iters.sort() + iter_stats = [] + for iter in iters: + iter_stats.append([iter] + comp_stats[iter]) + stat_tables_per_component[component_name] = iter_stats + stat_tables_per_component_per_dir[dir] = stat_tables_per_component + + main_stat_tables = stat_tables_per_component_per_dir[exp_dir] + for component_name in main_stat_tables.keys(): + # this is the main experiment directory + file = open("{dir}/nonlinstats_{comp_name}.log".format(dir = output_dir, comp_name = component_name), "w") + file.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\n") + iter_stat_report = "" + iter_stats = main_stat_tables[component_name] + for row in iter_stats: + iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" + file.write(iter_stat_report) + file.close() + + if plot: + main_component_names = main_stat_tables.keys() + main_component_names.sort() + + plot_component_names = set(main_component_names) + for dir in dirs: + component_names = set(stats_per_dir[dir].keys()) + plot_component_names = plot_component_names.intersection(component_names) + plot_component_names = list(plot_component_names) + plot_component_names.sort() + if plot_component_names != main_component_names: + logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. 
Make sure that these are comparable experiments before analyzing these plots.") + + fig = plt.figure() + for component_name in main_component_names: + fig.clf() + index = 0 + plots = [] + for dir in dirs: + color_val = plot_colors[index] + index += 1 + try: + iter_stats = stat_tables_per_component_per_dir[dir][component_name] + except KeyError: + # this component is not available in this network so lets not just plot it + continue + + data = np.array(iter_stats) + data = data[data[:,0] >=start_iter, :] + ax = plt.subplot(211) + mp, = ax.plot(data[:,0], data[:,1], color=color_val, label="Mean {0}".format(dir)) + msph, = ax.plot(data[:,0], data[:,1] + data[:,2], color=color_val, linestyle='--', label = "Mean+-Stddev {0}".format(dir)) + mspl, = ax.plot(data[:,0], data[:,1] - data[:,2], color=color_val, linestyle='--') + plots.append(mp) + plots.append(msph) + ax.set_ylabel('Value-{0}'.format(comp_type)) + ax.grid(True) + + ax = plt.subplot(212) + mp, = ax.plot(data[:,0], data[:,3], color=color_val) + msph, = ax.plot(data[:,0], data[:,3] + data[:,4], color=color_val, linestyle='--') + mspl, = ax.plot(data[:,0], data[:,3] - data[:,4], color=color_val, linestyle='--') + ax.set_xlabel('Iteration') + ax.set_ylabel('Derivative-{0}'.format(comp_type)) + ax.grid(True) + + lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + plt.grid(True) + fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) + +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): + try: + os.makedirs(output_dir) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(output_dir): + pass + else: + raise e + if plot: + latex_report = LatexReport("{0}/report.pdf".format(output_dir)) + else: + latex_report = None + + if is_chain: + logger.info("Generating log-probability plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating accuracy plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating log-likelihood plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating non-linearity stats plots") + GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating parameter difference files") + # Parameter changes + key_file = {"Parameter differences":"parameter.diff", + "Relative parameter differences":"relative_parameter.diff"} + for key in key_file.keys(): + file = open("{0}/{1}".format(output_dir, key_file[key]), "w") + data = nlp.ParseProgressLogsForParamDiff(exp_dir, key) + for row in data: + file.write(" 
".join(map(lambda x:str(x),row))+"\n") + file.close() + if plot and latex_report is not None: + has_compiled = latex_report.Close() + if has_compiled: + logger.info("Report has been generated. You can find it at the location {0}".format("{0}/report.pdf".format(output_dir))) + +def Main(): + args = GetArgs() + GeneratePlots(args.exp_dir, args.output_dir, + comparison_dir = args.comparison_dir, + start_iter = args.start_iter, + is_chain = args.is_chain) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py new file mode 100755 index 00000000000..1c2f3a1e9b8 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py @@ -0,0 +1,155 @@ +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + +from __future__ import division +import sys, glob, re, math, datetime, argparse +import imp + +ntl = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +#exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] +def ParseProgressLogsForNonlinearityStats(exp_dir): + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + stats_per_component_per_iter = {} + + progress_log_lines = ntl.RunKaldiCommand('grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files))[0] + + parse_regex = re.compile(".*progress.([0-9]+).log:component name=(.+) type=(.*)Component,.*value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]") + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', '0.134', '0.0397') + iteration = int(groups[0]) + component_name = groups[1] + component_type = groups[2] + value_mean = float(groups[3]) + value_stddev = float(groups[4]) + deriv_mean = float(groups[5]) + deriv_stddev = float(groups[6]) + try: + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + except KeyError: + stats_per_component_per_iter[component_name] = {} + stats_per_component_per_iter[component_name]['type'] = component_type + stats_per_component_per_iter[component_name]['stats'] = {} + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + + return stats_per_component_per_iter + +def ParseDifferenceString(string): + dict = {} + for parts in string.split(): + sub_parts = parts.split(":") + dict[sub_parts[0]] = float(sub_parts[1]) + return dict + +#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 
Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] +def ParseProgressLogsForParamDiff(exp_dir, pattern): + if pattern not in set(["Relative parameter differences", "Parameter differences"]): + raise Exception("Unknown value for pattern : {0}".format(pattern)) + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + progress_per_iter = {} + component_names = set([]) + progress_log_lines = ntl.RunKaldiCommand('grep -e "{0}" {1}'.format(pattern, progress_log_files))[0] + parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern)) + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + iteration = groups[0] + differences = ParseDifferenceString(groups[1]) + component_names = component_names.union(differences.keys()) + progress_per_iter[int(iteration)] = differences + + component_names = list(component_names) + component_names.sort() + # rearranging the data into an array + data = [] + data.append(["iteration"]+component_names) + max_iter = max(progress_per_iter.keys()) + for iter in range(max_iter + 1): + try: + component_dict = progress_per_iter[iter] + except KeyError: + continue + iter_values = [] + for component_name in component_names: + try: + iter_values.append(component_dict[component_name]) + except KeyError: + # the component was not found this iteration, may be because of layerwise discriminative training + iter_values.append(0) + data.append([iter] + iter_values) + + return data + +def ParseTrainLogs(exp_dir): + train_log_files = "%s/log/train.*.log" % (exp_dir) + train_log_lines = ntl.RunKaldiCommand('grep -e Accounting {0}'.format(train_log_files))[0] + parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*") + + train_times = {} + for line in train_log_lines.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + try: + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + except KeyError: + train_times[int(groups[0])] = {} + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + iters = train_times.keys() + for iter in iters: + values = train_times[iter].values() + train_times[iter] = max(values) + return train_times + +def ParseProbLogs(exp_dir, key = 'accuracy'): + train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) + valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) + train_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, train_prob_files), wait = True)[0] + valid_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, valid_prob_files))[0] + + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames. + parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-e]+) .*per frame") + train_loss={} + valid_loss={} + + + for line in train_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + train_loss[int(groups[0])] = groups[2] + for line in valid_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + valid_loss[int(groups[0])] = groups[2] + iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) + iters.sort() + return map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters) + +def GenerateAccuracyReport(exp_dir, key = "accuracy"): + times = ParseTrainLogs(exp_dir) + data = ParseProbLogs(exp_dir, key) + report = [] + report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") + for x in data: + try: + report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) + except KeyError: + continue + + total_time = 0 + for iter in times.keys(): + total_time += times[iter] + report.append("Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time)))) + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..57291324d28 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,485 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. 
'-3,-2,-1,0,1,2,3'") + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'weighted-average', 'per-dim-weighted-average', 'multi-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = True) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + raise Exception("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + raise Exception("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + args.nonlin_input_dim = args.relu_dim + args.nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + raise Exception("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + args.nonlin_input_dim = args.pnorm_input_dim + args.nonlin_output_dim = args.pnorm_output_dim + + return args + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats, num_feats) + + return [output_descriptor, filter_context, filter_context] + +def AddMultiDimAffineLayer(config_lines, name, input, input_window, block_input_dim, block_output_dim): + assert(block_input_dim % input_window== 0) + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that 
columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats / (block_input_dim / input_window) * block_output_dim, num_feats / (block_input_dim/ input_window)) + + return [output_descriptor, filter_context, filter_context] + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + try: + import scipy.signal as signal + import numpy as np + except ImportError: + raise Exception(" This recipe cannot be run without scipy." + " You can install it using the command \n" + " pip install scipy\n" + " If you do not have admin access on the machine you are" + " trying to run this recipe, you can try using" + " virtualenv") + # low-pass smoothing of input was specified. so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = list(np.append(lp_filter, 0)) + nnet3_train_lib.WriteKaldiMatrix(lpfilt_filename, [lp_filter]) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(" "); # we already checked the string is nonempty. 
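+ # e.g. the default from train.sh, "-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0", parses into + # 6 splice arrays (hence 6 hidden layers) with left_context = right_context = 4+0+2+0+4+0 = 10.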
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + e) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +def MakeConfigs(config_dir, splice_indexes_string, + feat_dim, ivector_dim, num_targets, + nonlin_input_dim, nonlin_output_dim, subset_dim, + pool_type, pool_window, pool_lpfilter_width, + use_presoftmax_prior_scale, final_layer_normalize_target, + include_log_softmax, xent_regularize, xent_separate_forward_affine, self_repair_scale): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if pool_type != "none" and pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(pool_type)) + if pool_type in set(["low-pass", "weighted-average"]): + if pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + pool_lpfilter_width, + pool_window, + config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + elif pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = AddPerDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + pool_window) + + left_context += cur_left_context + right_context += cur_right_context + elif pool_type == "multi-dim-weighted-average": + [prev_layer_output, cur_left_context, cur_right_context] = AddMultiDimAffineLayer(config_lines, + 'Tdnn_input_PDA_{0}'.format(i), + prev_layer_output, + pool_window, + 10 * pool_window, 10) + left_context += cur_left_context + right_context += cur_right_context + + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + prev_layer_output_chain = 
nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + nonlin_input_dim = args.nonlin_input_dim, + nonlin_output_dim = args.nonlin_output_dim, + subset_dim = args.subset_dim, + pool_type = args.pool_type, pool_window = args.pool_window, + pool_lpfilter_width = args.pool_lpfilter_width, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale) + +if __name__ == "__main__": + Main() + diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh 
b/egs/wsj/s5/steps/nnet3/tdnn/train.sh new file mode 100755 index 00000000000..773e10ccab6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -0,0 +1,660 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. 
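+# Note: the realignment options below take effect only when realign_times is non-empty; +# in that case align_cmd and align_use_gpu must also be set (this is checked further below).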
+align_cmd= # The cmd that is passed to steps/nnet2/align.sh +align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] +realign_times= # List of times on which we realign. Each time is + # floating point number strictly between 0 and 1, which + # will be multiplied by the num-iters to get an iteration + # number. +num_jobs_align=30 # Number of jobs for realignment +# End configuration section. +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" + echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" + echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" + echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +if [ ! 
-z "$realign_times" ]; then + [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1 + [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1 +fi + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 +[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1 +[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1 + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! 
-z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" $egs_opts \ + --frames-per-eg $frames_per_eg \ + $data $alidir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... 
using priors^$presoftmax_prior_scale_power and rescaling to average 1"
+
+  # obtain the raw pdf counts from the alignments
+  $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
+    ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
+    post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1;
+  $cmd $dir/log/sum_pdf_counts.log \
+    vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1;
+  rm $dir/pdf_counts.*
+
+  # Smooth the counts, raise the resulting priors to the configured power and
+  # rescale so that the scales average to 1; write the result as a text vector.
+  awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \
+    '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; }
+       num_pdfs=NF-2; average_count = total/num_pdfs;
+       for (i=0; i<num_pdfs; i++) { count[i] += smooth*average_count; }
+       total += smooth*average_count*num_pdfs;
+       for (i=0; i<num_pdfs; i++) { scale[i] = (count[i]/total)^power; scale_total += scale[i]; }
+       average_scale = scale_total/num_pdfs;
+       printf "[ ";
+       for (i=0; i<num_pdfs; i++) { printf("%f ", scale[i]/average_scale); }
+       print "]"; }' $dir/pdf_counts > $dir/presoftmax_prior_scale.vec
+  ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec
+fi
+
+if [ $stage -le -1 ]; then
+  # Add the first layer; this will add in the lda.mat and
+  # presoftmax_prior_scale.vec.
+  $cmd $dir/log/add_first_layer.log \
+    nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;
+
+  # Convert to .mdl, train the transitions, set the priors.
+  $cmd $dir/log/init_mdl.log \
+    nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \
+    nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1;
+fi
+
+
+# set num_iters so that, as closely as possible, we process the data $num_epochs
+# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded,
+# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+
+num_archives_to_process=$[$num_epochs*$num_archives_expanded]
+num_archives_processed=0
+num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
+
+finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
+
+# check that there are enough iterations to finish adding all hidden layers.
+! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
+  && echo "$0: Insufficient epochs" && exit 1
+
+echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
+
+if $use_gpu; then
+  parallel_suffix=""
+  train_queue_opt="--gpu 1"
+  combine_queue_opt="--gpu 1"
+  prior_gpu_opt="--use-gpu=yes"
+  prior_queue_opt="--gpu 1"
+  parallel_train_opts=
+  if ! cuda-compiled; then
+    echo "$0: WARNING: you are running with one thread but you have not compiled"
+    echo "  for CUDA. You may be running a setup optimized for GPUs. If you have"
+    echo "  GPUs and have nvcc installed, go to src/ and do ./configure; make"
+    exit 1
+  fi
+else
+  echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads."
+  parallel_train_opts="--use-gpu=no"
+  combine_queue_opt=""  # the combine stage will be quite slow if not using
+                        # GPU, as we didn't enable that program to use
+                        # multiple threads.
+  prior_gpu_opt="--use-gpu=no"
+  prior_queue_opt=""
+fi
+
+
+approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
+# First work out how many iterations we want to combine over in the final
+# nnet3-combine-fast invocation. (We may end up subsampling from these if the
+# number exceeds max_model_combine).
The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + +for realign_time in $realign_times; do + # Work out the iterations on which we will re-align, if the --realign-times + # option was used. This is slightly approximate. + ! perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" && \ + echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1."; + # the next formula is based on the one for mix_up_iter above. + realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1; + realign_this_iter[$realign_iter]=$realign_time +done + +cur_egs_dir=$egs_dir + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ ! -z "${realign_this_iter[$x]}" ]; then + prev_egs_dir=$cur_egs_dir + cur_egs_dir=$dir/egs_${realign_this_iter[$x]} + fi + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + if [ ! -z "${realign_this_iter[$x]}" ]; then + time=${realign_this_iter[$x]} + + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
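The iteration count, the ramp-up of parallel jobs and the learning-rate schedule used in this loop can be sanity-checked outside the script. The archive and epoch counts below are invented for illustration; only the formulas mirror the ones above.

  ilr=0.01; flr=0.001                  # initial/final effective learning rates
  nt=1200                              # num_epochs * num_archives_expanded (invented)
  num_jobs_initial=1; num_jobs_final=8
  num_iters=$[(2*$nt)/($num_jobs_initial+$num_jobs_final)]   # -> 266 iterations
  # The effective learning rate decays exponentially in the number of archives
  # processed so far (np), from ilr at np=0 to flr at np=nt:
  for np in 0 600 1200; do
    perl -e "print $ilr*exp($np*log($flr/$ilr)/$nt), qq(\n);"
  done
  # -> 0.01, ~0.00316, 0.001.  The rate actually passed to nnet3-train is this
  # value times the number of jobs on that iteration, so that averaging the
  # jobs' models leaves the effective step size unchanged.
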
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.$x.log \ + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + + sleep 2 + + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ + --iter $x $data $lang $dir $dir/ali_$time || exit 1 + + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + $prev_egs_dir $cur_egs_dir || exit 1 + + if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then + steps/nnet3/remove_egs.sh $prev_egs_dir + fi + fi + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. 
+ frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) + + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. 
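Each parallel training job in the loop above derives its archive and frame offset from a single running counter k; the numbers below are made up, but tracing them shows how jobs cycle through archives first and through frame offsets much more slowly.

  num_archives=10; frames_per_eg=8; num_archives_processed=1000   # invented values
  for n in 1 2 3; do                              # three jobs on this iteration
    k=$[$num_archives_processed+$n-1]             # zero-based running counter
    archive=$[($k%$num_archives)+1]               # 1-based archive index
    frame=$[(($k/$num_archives)%$frames_per_eg)]  # 0-based frame offset
    echo "job $n -> archive $archive, frame $frame"
  done
  # job 1 -> archive 1, frame 4
  # job 2 -> archive 2, frame 4
  # job 3 -> archive 3, frame 4
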
+ $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & +fi + +if [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purposes of adjusting the priors." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm $dir/post.$x.*.vec; + + echo "Re-adjusting priors based on computed posteriors" + $cmd $dir/log/adjust_priors.final.log \ + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; +fi + + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $cur_egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.mdl + fi + done +fi diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh new file mode 100644 index 00000000000..8f33272b97f --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh @@ -0,0 +1,551 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014-2016 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. 
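The prior_subset_size option above controls how many examples go into the average-posterior computation that the prior-computation stages of these scripts rely on. A rough sketch of what that stage amounts to, with made-up counts for three pdfs:

  # Summed average posteriors (invented numbers); normalizing them gives the
  # priors that are stored with the model and later used to turn the network's
  # posteriors into pseudo-likelihoods at decode time.
  echo "40000 100000 60000" | \
    awk '{ for(i=1;i<=NF;i++) t+=$i; for(i=1;i<=NF;i++) printf("%.2f ", $i/t); print "" }'
  # -> 0.20 0.50 0.30
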
+num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +configs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +frames_per_eg=8 # to be passed on to get_egs.sh + +# Raw nnet training options i.e. without transition model +nj=4 +dense_targets=true # Use dense targets instead of sparse targets + +# End configuration section. + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train scp:snr_targets/targets.scp exp/nnet3_snr_predictor" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... 
note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +targets_scp=$2 +dir=$3 + +# Check some files. +for f in $data/feats.scp $targets_scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if $add_final_sigmoid && $include_log_softmax; then + echo "add-final-sigmoid and include-log-softmax cannot both be true" +fi + +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + +if [ ! -z "$configs_dir" ]; then + cp -rT $configs_dir $dir/configs || exit 1 +fi + +if [ $stage -le -5 ]; then + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# model_left_context=(something) +# model_right_context=(something) +# num_hidden_layers=(something) +# num_targets=(something) +# add_lda=(true|false) +# include_log_softmax=(true|false) +# objective_type=(something) +. $dir/configs/vars || exit 1; +left_context=$model_left_context +right_context=$model_right_context + +[ -z "$num_targets" ] && echo "\$num_targets is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$add_lda" ] && echo "\$add_lda is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$include_log_softmax" ] && echo "\$include_log_softmax is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$objective_type" ] && echo "\$objective_type is not defined. Needs to be defined in $dir/configs/vars." && exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +if $dense_targets; then + tmp_num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + if [ $tmp_num_targets -ne $num_targets ]; then + echo "Mismatch between num-targets provided to script vs configs" + exit 1 + fi +fi + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! 
-z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir "$transform_dir") + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + + if $dense_targets; then + target_type=dense + else + target_type=sparse + fi + + steps/nnet3/get_egs_targets.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" --nj $nj \ + --frames-per-eg $frames_per_eg \ + --target-type $target_type --num-targets $num_targets \ + $data $targets_scp $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if $add_lda && [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
+ $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + + +compute_accuracy=false +if [ "$objective_type" == "linear" ]; then + compute_accuracy=true +fi + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no $dir/$[x-1].raw $dir/$x.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame + # index; this increases more slowly than the archive index because the + # same archive with different frame indexes will give similar gradients, + # so we want to separate them in time. + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. 
+ $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list $dir/$[x+1].raw || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-copy $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].raw ] && exit 1; + if [ -f $dir/$[$x-1].raw ] && $cleanup && \ + [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].raw + fi + fi + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.raw" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + nnet=$dir/$iter.raw + [ ! -f $nnet ] && echo "Expected $nnet to exist" && exit 1; + nnets_list[$n]=$nnet + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) + + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$egs_dir/combine.egs ark:-|" \ + $dir/final.raw || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & +fi + +if $include_log_softmax && [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purpose of using as prior to convert posteriors to likelihoods." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + $dir/final.raw ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
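The set of models handed to the nnet3-combine stage above is determined by the min/max expression computed earlier in the script; with some invented sizes the arithmetic works out as follows.

  max_models_combine=20; num_archives_expanded=80; num_jobs_final=8   # invented
  num_iters=266; finish_add_layers_iter=12                            # invented
  approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]   # 10
  num_iters_combine=$max_models_combine                                    # 20
  [ $num_iters_combine -lt $approx_iters_per_epoch_final ] && \
    num_iters_combine=$approx_iters_per_epoch_final                        # unchanged
  half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]    # 127
  [ $num_iters_combine -gt $half_iters_after_add_layers ] && \
    num_iters_combine=$half_iters_after_add_layers                         # unchanged
  first_model_combine=$[$num_iters-$num_iters_combine+1]                   # 247
  echo "combine models $first_model_combine..$num_iters"                   # 247..266
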
+ + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm -f $dir/post.$x.*.vec; + +fi + + +if [ ! -f $dir/final.raw ]; then + echo "$0: $dir/final.raw does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.raw + fi + done +fi + diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py new file mode 100755 index 00000000000..cde3ef14933 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward DNN acoustic model using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. 
If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + "iteration. If 0 or a large value the randomization is" + "complete, but this will consume memory and cause spikes" + "in disk I/O. Smaller is easier on disk and memory but" + "less random. It's not a huge deal though, as samples" + "are anyway randomized right at the start." + "(the point of this is to get data in different" + "minibatches on different iterations, since in the" + "preconditioning method, 2 samples in the same minibatch" + "can affect each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per minibatch," + "measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. 
'0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. 
""") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs" + " directory which is the output of make_configs.py script") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "ark:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + frame = frame, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + minibatch_size = minibatch_size), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. 
+ raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. 
This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if (args.stage <= -5): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -4) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.frames_per_eg == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -3): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -2): + logger.info("Computing initial vector for FixedScaleComponent before" + " softmax, using priors^{prior_scale} and rescaling to" + " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + + ComputePresoftmaxPriorScale(args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
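+    # A worked example of the quantities computed just below, with
+    # hypothetical values num_archives=100, frames_per_eg=8, num_epochs=4,
+    # num_jobs_initial=2, num_jobs_final=14:
+    #   num_archives_expanded   = 100 * 8               = 800
+    #   num_archives_to_process = 4 * 800               = 3200
+    #   num_iters               = (3200 * 2) / (2 + 14) = 400
+    # so an average of (2 + 14) / 2 = 8 jobs per iteration gives
+    # 400 * 8 = 3200 archive-passes, i.e. 4 epochs over the 800 expanded archives.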
+ num_archives_expanded = num_archives * args.frames_per_eg + num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + + logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + args.minibatch_size, args.frames_per_eg, + num_hidden_layers, args.add_layers_period, + left_context, right_context, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + 
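+        # Background on why the priors are re-adjusted (general Kaldi
+        # practice, not specific to this script): at decode time the
+        # network's posteriors are converted to pseudo-likelihoods roughly as
+        #   log p(x|pdf) ~ log P(pdf|x) - log P(pdf) + const
+        # so the prior vector stored in the model matters.  Here the average
+        # posterior of the combined model over a subset of egs (computed
+        # above) is used as the prior estimate and written into final.mdl by
+        # the AdjustAmPriors call below.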
logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py new file mode 100755 index 00000000000..463b0a0d3ff --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains an RNN acoustic model using the cross-entropy objective. + RNNs include LSTMs, BLSTMs and GRUs. + RNN acoustic model training differs from feed-forward DNN training + in the following ways + 1. RNN acoustic models train on output chunks rather than individual + outputs + 2. The training includes additional stage of shrinkage, where + the parameters of the model are scaled when the derivative averages + at the non-linearities are below a threshold. + 3. 
RNNs can also be trained with state preservation training + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 20, + help="""Number of output labels in the sequence + used to train an LSTM. + Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. '0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. 
+ Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be positive") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be positive") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. 
""" + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
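+    # Note that, unlike the DNN trainer, there is no frames_per_eg expansion
+    # here: each RNN eg is a whole chunk of chunk_width labels and the
+    # --frame option is not used, so an archive is processed once per epoch.
+    # A worked example with hypothetical values num_archives=240, num_epochs=8,
+    # num_jobs_initial=2, num_jobs_final=14:
+    #   num_archives_to_process = 8 * 240               = 1920
+    #   num_iters               = (1920 * 2) / (2 + 14) = 240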
+ num_archives_to_process = args.num_epochs * num_archives + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + if args.num_bptt_steps is None: + num_bptt_steps = args.chunk_width + else: + num_bptt_steps = args.num_bptt_steps + + min_deriv_time = args.chunk_width - num_bptt_steps + + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + left_context, right_context, min_deriv_time, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, 
num_iters_combine, egs_dir, run_opts, + chunk_width = args.chunk_width) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index e17026e496f..d8ac11da720 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -93,6 +93,7 @@ echo -n >$ieconf cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1; echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf +echo "--ivector-period=$ivector_period" >>$ieconf echo "--splice-config=$dir/conf/splice.conf" >>$ieconf echo "--lda-matrix=$srcdir/final.mat" >>$ieconf echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf diff --git a/egs/wsj/s5/steps/paste_feats.sh b/egs/wsj/s5/steps/paste_feats.sh index da82179f616..abeee5aba23 100755 --- a/egs/wsj/s5/steps/paste_feats.sh +++ b/egs/wsj/s5/steps/paste_feats.sh @@ -44,10 +44,10 @@ done mkdir -p $ark_dir $logdir -mkdir -p $data +mkdir -p $data cp $data_src_first/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. -rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` @@ -58,19 +58,25 @@ for data_src in ${data_src_arr[@]}; do data_src_args="$data_src_args scp:$data_src/split$nj/JOB/feats.scp" done +for n in $(seq $nj); do + # the next command does nothing unless $arkdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
+ utils/create_data_link.pl $arkdir/pasted_$name.$n.ark +done + $cmd JOB=1:$nj $logdir/append.JOB.log \ paste-feats --length-tolerance=$length_tolerance $data_src_args ark:- \| \ copy-feats --compress=$compress ark:- \ ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + # concatenate the .scp files together. for ((n=1; n<=nj; n++)); do cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/score_kaldi.sh b/egs/wsj/s5/steps/score_kaldi.sh index 8a2aee9d48d..f054ebdb41d 100755 --- a/egs/wsj/s5/steps/score_kaldi.sh +++ b/egs/wsj/s5/steps/score_kaldi.sh @@ -137,6 +137,12 @@ if [ $stage -le 1 ]; then cat $dir/scoring_kaldi/wer_details/per_utt \| \ utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi fi diff --git a/egs/wsj/s5/steps/score_kaldi_compare.sh b/egs/wsj/s5/steps/score_kaldi_compare.sh new file mode 100755 index 00000000000..91fc057b906 --- /dev/null +++ b/egs/wsj/s5/steps/score_kaldi_compare.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2016 Nicolas Serrano +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +replications=10000 +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_compare.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --replications # number of bootstrap evaluation to compute confidence." + exit 1; +fi + +dir1=$1 +dir2=$2 +dir_compare=$3 + +mkdir -p $dir_compare/log + +for d in $dir1 $dir2; do + for f in test_filt.txt best_wer; do + [ ! -f $d/$f ] && echo "score_compare.sh: no such file $d/$f" && exit 1; + done +done + + +best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) +best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') + +best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) +best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') + +$cmd $dir_compare/log/score_compare.log \ + compute-wer-bootci --replications=$replications \ + ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ + '>' $dir_compare/wer_bootci_comparison || exit 1; + +exit 0; diff --git a/egs/wsj/s5/steps/select_feats.sh b/egs/wsj/s5/steps/select_feats.sh index 970823fdf25..072dd3194cf 100755 --- a/egs/wsj/s5/steps/select_feats.sh +++ b/egs/wsj/s5/steps/select_feats.sh @@ -43,31 +43,31 @@ mkdir -p $ark_dir $logdir mkdir -p $data cp $data_in/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. 
-rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` -for j in $(seq $nj); do +for j in $(seq $nj); do # the next command does nothing unless $mfccdir/storage/ exists, see # utils/create_data_link.pl for more info. - utils/create_data_link.pl $ark_dir/pasted_$name.$j.ark + utils/create_data_link.pl $ark_dir/selected_$name.$j.ark done $cmd JOB=1:$nj $logdir/append.JOB.log \ select-feats "$selector" scp:$data_in/split$nj/JOB/feats.scp ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + ark,scp:$ark_dir/selected_$name.JOB.ark,$ark_dir/selected_$name.JOB.scp || exit 1; + # concatenate the .scp files together. for ((n=1; n<=nj; n++)); do - cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; + cat $ark_dir/selected_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" exit 1; diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index bb4d4e77e7c..5e1a9cba470 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -46,7 +46,7 @@ srcdir=$1 destdir=$2 if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" exit 1; fi @@ -82,7 +82,7 @@ if [ -f $srcdir/segments ]; then cp $srcdir/reco2file_and_channel $destdir/ fi else # no segments->wav indexed by utt. - if [ -f $srcdir/wav.scp ]; then + if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp fi fi @@ -90,6 +90,9 @@ fi if [ -f $srcdir/text ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi diff --git a/egs/wsj/s5/utils/create_split_dir.pl b/egs/wsj/s5/utils/create_split_dir.pl index 0c4f023f7f3..dc94f3bad43 100755 --- a/egs/wsj/s5/utils/create_split_dir.pl +++ b/egs/wsj/s5/utils/create_split_dir.pl @@ -53,6 +53,7 @@ # If the symbolic link already exists, delete it. 
if (-l $pseudo_storage) { print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n"; + $index++; next; } diff --git a/egs/wsj/s5/utils/data/combine_data.sh b/egs/wsj/s5/utils/data/combine_data.sh new file mode 120000 index 00000000000..0aed7e823b7 --- /dev/null +++ b/egs/wsj/s5/utils/data/combine_data.sh @@ -0,0 +1 @@ +../combine_data.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/copy_data_dir.sh b/egs/wsj/s5/utils/data/copy_data_dir.sh new file mode 120000 index 00000000000..b9854db4655 --- /dev/null +++ b/egs/wsj/s5/utils/data/copy_data_dir.sh @@ -0,0 +1 @@ +../copy_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh new file mode 100755 index 00000000000..77f5f8eb7dc --- /dev/null +++ b/egs/wsj/s5/utils/data/get_frame_shift.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script takes as input a data directory, such as data/train/, preferably +# with utt2dur file already existing (or the utt2dur file will be created if +# not), and it attempts to work out the approximate frame shift by comparing the +# utt2dur with the output of feat-to-len on the feats.scp. It prints it out. +# if the shift is very close to, but above, 0.01 (the normal frame shift) it +# rounds it down. + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + echo "This script prints the frame-shift (e.g. 0.01) to the standard out." + echo "If does not contain utt2dur, this script will call utils/data/get_utt2dur.sh," + echo "which will require write permission to " + exit 1 +fi + +export LC_ALL=C + +dir=$1 + +if [ ! -f $dir/utt2dur ]; then + echo "$0: $dir/utt2dur does not exist: creating it" 1>&2 + utils/data/get_utt2dur.sh $dir 1>&2 +fi + +if [ ! -f $dir/feats.scp ]; then + echo "$0: $dir/feats.scp does not exist" 1>&2 + exit 1 +fi + +temp=$(mktemp /tmp/tmp.XXXX) + +feat-to-len scp:$dir/feats.scp ark,t:- | head -n 10 > $temp + +if [ -z $temp ]; then + echo "$0: error running feat-to-len" 1>&2 + exit 1 +fi + +head -n 10 $dir/utt2dur | paste - $temp | \ + awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }' || exit 1; + +rm $temp + +exit 0 diff --git a/egs/wsj/s5/utils/data/get_num_frames.sh b/egs/wsj/s5/utils/data/get_num_frames.sh new file mode 100755 index 00000000000..9c4aae5e693 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_num_frames.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script works out the approximate number of frames in a training directory. +# This is sometimes needed by higher-level scripts + + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + ( + echo "Usage: $0 " + echo "Prints the number of frames of data in the data-dir" + ) 1>&2 +fi + +data=$1 + +if [ ! 
-f $data/utt2dur ]; then + utils/data/get_utt2dur.sh $data 1>&2 || exit 1 +fi + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 + +awk -v s=$frame_shift '{n += $2} END{print int(n / s)}' <$data/utt2dur diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh new file mode 100755 index 00000000000..344eb773581 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and adds the +# utt2dur file if it does not already exist. The file 'utt2dur' maps from +# utterance to the duration of the utterance in seconds. This script works it +# out from the 'segments' file, or, if not present, from the wav.scp file (it +# first tries interrogating the headers, and if this fails, it reads the wave +# files in entirely.) + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + exit 1 +fi + +export LC_ALL=C + +data=$1 + + +if [ -f $data/segments ]; then + echo "$0: working out $data/utt2dur from $data/segments" + cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur +else + echo "$0: segments file does not exist so getting durations from wave files" + if [ ! -f $data/wav.scp ]; then + echo "$0: Expected $data/wav.scp or $data/segments to exist" + exit 1 + fi + + # if the wav.scp contains only lines of the form + # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph | + if cat $data/wav.scp | perl -e ' + while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space. + @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ && + $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); } + $utt = $A[0]; $sphere_file = $A[4]; + if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; } + $sample_rate = -1; $sample_count = -1; + for ($n = 0; $n <= 30; $n++) { + $line = ; + if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; } + if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; } + if ($line =~ m/end_head/) { break; } + } + close(F); + if ($sample_rate == -1 || $sample_count == -1) { + die "could not parse sphere header from $sphere_file"; + } + $duration = $sample_count * 1.0 / $sample_rate; + print "$utt $duration\n"; + } ' > $data/utt2dur; then + echo "$0: successfully obtained utterance lengths from sphere-file headers" + else + echo "$0: could not get utterance lengths from sphere-file headers, using wav-to-duration" + if ! command -v wav-to-duration >/dev/null; then + echo "$0: wav-to-duration is not on your path" + exit 1; + fi + if ! 
wav-to-duration scp:$data/wav.scp ark,t:$data/utt2dur 2>&1 | grep -v 'nonzero return status'; then + echo "$0: there was a problem getting the durations; moving $data/utt2dur to $data/.backup/" + mkdir -p $data/.backup/ + mv $data/utt2dur $data/.backup/ + fi + fi +fi + +len1=$(cat $data/utt2spk | wc -l) +len2=$(cat $data/utt2dur | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1" +fi + +echo "$0: computed $data/utt2dur" + +exit 0 diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh new file mode 120000 index 00000000000..1cd5db30d92 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh @@ -0,0 +1 @@ +../perturb_data_dir_speed.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh new file mode 100755 index 00000000000..a5a030ffdd8 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +# Apache 2.0 + +# This script does the standard 3-way speed perturbing of +# a data directory (it operates on the wav.scp). + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: perturb_data_dir_speed_3way.sh " + echo "Applies standard 3-way speed perturbation using factors of 0.9, 1.0 and 1.1." + echo "e.g.:" + echo " $0 data/train data/train_sp" + echo "Note: if /feats.scp already exists, this will refuse to run." + exit 1 +fi + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/wav.scp ]; then + echo "$0: expected $srcdir/wav.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 +utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 +utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 + +rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 + +echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir + diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh new file mode 100755 index 00000000000..b7fb0cfce26 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and modifies +# the wav.scp to perturb the volume (typically useful for training data when +# using systems that don't have cepstral mean normalization). + +. utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + exit 1 +fi + +export LC_ALL=C + +data=$1 + +if [ ! -f $data/wav.scp ]; then + echo "$0: Expected $data/wav.scp to exist" + exit 1 +fi + +if [ grep "sox --vol" $data/wav.scp ]; then + echo "$0: It looks like the data was already volume perturbed. Not doing anything." 
+ exit 0 +fi + +cat $data/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +" > $data/wav.scp_scaled || exit 1; + +len1=$(cat $data/wav.scp | wc -l) +len2=$(cat $data/wav.scp_scaled | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: error detected: number of lines changed $len1 vs $len2"; + exit 1 +fi + +mv $data/wav.scp_scaled $data/wav.scp + +if [ -f $data/feats.scp ]; then + echo "$0: $data/feats.scp exists; moving it to $data/.backup/ as it wouldn't be valid any more." + mkdir -p $data/.backup/ + mv $data/feats.scp $data/.backup/ +fi + +echo "$0: added volume perturbation to the data in $data" +exit 0 + diff --git a/egs/wsj/s5/utils/data/validate_data_dir.sh b/egs/wsj/s5/utils/data/validate_data_dir.sh new file mode 120000 index 00000000000..1e19b4d921f --- /dev/null +++ b/egs/wsj/s5/utils/data/validate_data_dir.sh @@ -0,0 +1 @@ +../validate_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 4716925df7d..b6ce1511814 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -1,9 +1,9 @@ #!/bin/bash -# This script makes sure that only the segments present in +# This script makes sure that only the segments present in # all of "feats.scp", "wav.scp" [if present], segments [if present] # text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into +# It puts the original contents of data-dir into # data-dir/.backup if [ $# != 1 ]; then @@ -35,7 +35,8 @@ function check_sorted { fi } -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp reco2file_and_channel spk2gender utt2lang utt2uniq; do +for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x @@ -61,7 +62,7 @@ function filter_file { function filter_recordings { # We call this once before the stage when we filter on utterance-id, and once # after. - + if [ -f $data/segments ]; then # We have a segments file -> we need to filter this and the file wav.scp, and # reco2file_and_utt, if it exists, to make sure they have the same list of @@ -78,7 +79,7 @@ function filter_recordings { utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp mv $tmpdir/recordings.tmp $tmpdir/recordings - + cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments filter_file $tmpdir/recordings $data/segments cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments @@ -86,7 +87,7 @@ function filter_recordings { filter_file $tmpdir/recordings $data/wav.scp [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - + fi } @@ -116,8 +117,6 @@ function filter_speakers { function filter_utts { cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts -# Do a check. - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; @@ -128,7 +127,7 @@ function filter_utts { ! 
cat $data/spk2utt | sort | cmp - $data/spk2utt && \ echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - if [ -f $data/utt2uniq ]; then + if [ -f $data/utt2uniq ]; then ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; fi @@ -155,7 +154,7 @@ function filter_utts { fi fi - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang $maybe_wav; do + for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur $maybe_wav; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then diff --git a/egs/wsj/s5/utils/format_lm_sri.sh b/egs/wsj/s5/utils/format_lm_sri.sh index 7753c186045..7b5477e958a 100755 --- a/egs/wsj/s5/utils/format_lm_sri.sh +++ b/egs/wsj/s5/utils/format_lm_sri.sh @@ -71,8 +71,8 @@ if [ -z $loc ]; then export PATH=$PATH:$sdir:$sdir/.. else echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. + echo or installed in $sdir. cd to ../../../tools and run + echo extras/install_srilm.sh. exit 1 fi fi @@ -88,8 +88,8 @@ lm_base=$(basename $lm '.gz') gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ > $out_dir/oovs_${lm_base}.txt || exit 1; -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures +# Removing all "illegal" combinations of and , which are supposed to +# occur only at being/end of utt. These can cause determinization failures # of CLG [ends up being epsilon cycles]. gunzip -c $lm \ | egrep -v ' | | ' \ @@ -98,8 +98,8 @@ gunzip -c $lm \ awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; # Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose +# and the set of words in the pronunciation lexicon. This also renormalizes the +# LM by recomputing the backoff weights, and remove those ngrams whose # probabilities are lower than the backed-off estimates. change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ $srilm_opts || exit 1; diff --git a/egs/wsj/s5/utils/lang/add_lex_disambig.pl b/egs/wsj/s5/utils/lang/add_lex_disambig.pl new file mode 120000 index 00000000000..2d1d4425b49 --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_lex_disambig.pl @@ -0,0 +1 @@ +../add_lex_disambig.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/check_g_properties.pl b/egs/wsj/s5/utils/lang/check_g_properties.pl new file mode 100755 index 00000000000..aa0e6eb1c78 --- /dev/null +++ b/egs/wsj/s5/utils/lang/check_g_properties.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +if (@ARGV != 1) { + print "Usage: $0 [options] \n"; + print "e.g.: $0 data/lang\n"; + exit(1); +} + +$lang = shift @ARGV; + +# This script checks that G.fst in the lang.fst directory is OK with respect +# to certain expected properties, and returns nonzero exit status if a problem was +# detected. It is called from validate_lang.pl. +# This only checks the properties of G that relate to disambiguation symbols, +# epsilons and forbidden symbols and . + +if (! 
-e "$lang/G.fst") { + print "$0: error: $lang/G.fst does not exist\n"; + exit(1); +} + +open(W, "<$lang/words.txt") || die "opening $lang/words.txt"; +$hash_zero = -1; +while () { + @A = split(" ", $_); + ($sym, $int) = @A; + if ($sym eq "" || $sym eq "") { $is_forbidden{$int} = 1; } + if ($sym eq "#0") { $hash_zero = $int; } +} + +if (-e "$lang/phones/wdisambig_words.int") { + open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int"; + while () { + chop; + $is_disambig{$_} = 1; + } +} else { + $is_disambig{$hash_zero} = 1; +} + +$input_cmd = ". ./path.sh; fstprint $lang/G.fst|"; +open(G, $input_cmd) || die "running command $input_cmd"; + +$info_cmd = ". ./path.sh; fstcompile | fstinfo "; +open2(O, I, "$info_cmd") || die "running command $info_cmd"; + +$has_epsilons = 0; + +while () { + @A = split(" ", $_); + if (@A >= 4) { + if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) { + chop; + print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol or \n"; + exit(1); + } elsif ($is_disambig{$A[2]}) { + print O $_; + if ($A[3] != 0) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n"; + exit(1); + } + } elsif ($A[2] == 0) { + print O $_; + $has_epsilons = 1; + } elsif ($A[2] != $A[3]) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol.\n"; + exit(1); + } + } +} + +close(O); # tell 'fstcompile | fstinfo' pipeline that its input is done. +while () { + if (m/cyclic\s+/) { + print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n"; + exit(1); + } +} + +if ($has_epsilons) { + print "$0: warning: validating $lang: G.fst has epsilon-input arcs. We don't expect these in most setups.\n"; +} + +print "--> $0 successfully validated $lang/G.fst\n"; +exit(0); diff --git a/egs/wsj/s5/utils/lang/prepare_lang.sh b/egs/wsj/s5/utils/lang/prepare_lang.sh new file mode 120000 index 00000000000..96b9f592e82 --- /dev/null +++ b/egs/wsj/s5/utils/lang/prepare_lang.sh @@ -0,0 +1 @@ +../prepare_lang.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/validate_lang.pl b/egs/wsj/s5/utils/lang/validate_lang.pl new file mode 120000 index 00000000000..edb66bf3149 --- /dev/null +++ b/egs/wsj/s5/utils/lang/validate_lang.pl @@ -0,0 +1 @@ +../validate_lang.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/make_phone_bigram_lang.sh index 87d1db8f3e8..a8a67870fb3 100755 --- a/egs/wsj/s5/utils/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/make_phone_bigram_lang.sh @@ -11,7 +11,7 @@ # language-id. -# We might later have options here; if not, I'llr emove this. +# We might later have options here; if not, I'll emove this. echo "$0 $@" # Print the command line for logging @@ -42,6 +42,8 @@ rm -r $lang_out/phones 2>/dev/null cp -r $lang/phones/ $lang_out/ rm $lang_out/phones/word_boundary.* 2>/dev/null # these would # no longer be valid. +rm $lang_out/phones/wdisambig* 2>/dev/null # ditto this. + # List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst # are determinizable without any. 
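# A quick spot-check of that claim (illustrative only; $lang_out is the script's own output
# directory, and fsttablecompose/fstdeterminizestar are standard Kaldi fstbin tools assumed
# to be on the path):
#   fsttablecompose $lang_out/L.fst $lang_out/G.fst | fstdeterminizestar >/dev/null \
#     && echo "L o G determinized OK without any disambiguation symbols"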
echo -n > $lang_out/phones/disambig.txt @@ -81,7 +83,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ foreach $p (@phones) { $src = $phn2state{$p}; $hist = $histcount{$p}; - $hist > 0 || die; + $hist > 0 || die; foreach $q (@phones) { $c = $count{$p,$q}; if (defined $c) { @@ -92,7 +94,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ } $c = $count{$p,""}; if (defined $c) { - $cost = -log($c / $hist); # cost on FST arc. + $cost = -log($c / $hist); # cost on FST arc. print "$src $cost\n"; # final-prob. } } ' | fstcompile --acceptor=true | \ @@ -101,7 +103,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ # symbols for phones and words are the same. # Neither has disambig symbols. cp $lang_out/phones.txt $lang_out/words.txt - + grep -v '' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \ fstcompile > $lang_out/L.fst diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index 61c0962cf15..091ea0c069e 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2013 Johns Hopkins University (author: Daniel Povey) # 2014 Tom Ko @@ -36,7 +36,7 @@ which sox &>/dev/null ! [ $? -eq 0 ] && echo "sox: command not found" && exit 1; if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" + echo "$0: no such file $srcdir/utt2spk" exit 1; fi @@ -65,18 +65,18 @@ if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp if [ -f $srcdir/reco2file_and_channel ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel fi - + rm $destdir/reco_map 2>/dev/null else # no segments->wav indexed by utterance. 
if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp fi fi @@ -88,6 +88,10 @@ if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ -f $srcdir/utt2dur ]; then + cat $srcdir/utt2dur | utils/apply_map.pl -f 1 $destdir/utt_map | \ + awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur +fi rm $destdir/spk_map $destdir/utt_map 2>/dev/null echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/perturb_data_signal.sh b/egs/wsj/s5/utils/perturb_data_signal.sh new file mode 100755 index 00000000000..7034dd22d5b --- /dev/null +++ b/egs/wsj/s5/utils/perturb_data_signal.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# 2014 Tom Ko +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# wav.scp +# spk2utt +# utt2spk +# text +# spk_filter.scp +# It generates the files which are used for perturbing the data at signal-level. + +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: perturb_data_signal.sh " + echo "e.g.:" + echo " $0 'fp01' data/train_si284 data/train_si284p" + exit 1 +fi + +export LC_ALL=C + +prefix=$1 +srcdir=$2 +destdir=$3 +spk_prefix=$prefix"-" +utt_prefix=$prefix"-" + +for f in spk2utt text utt2spk wav.scp spk_filter.scp; do + [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1; +done + +set -e; +set -o pipefail + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + + +# The following perl script is the core part. 
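# What it produces, sketched with hypothetical recording ids and spk_filter.scp entries:
# each recording's wav.scp pipe gets two extra stages, a forward pass (--inverse=false)
# with the recording's own filter and an inverse pass (--inverse=true) with a different,
# randomly chosen filter, e.g.
#   fp01-rec1 sph2pipe -f wav rec1.sph | apply-filter --inverse=false "scp:echo rec1 filt_rec1.mat |" - - | apply-filter --inverse=true "scp:echo spk07 filt_spk07.mat |" - - |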
+
+echo $spk_prefix | perl -e '
+  $prefix = <STDIN>;
+  chomp($prefix);
+  ($u2s_in, $seg_in, $wav_in, $filt_in, $wav_out) = @ARGV;
+  if (open(SEG, "<$seg_in")) {
+    $have_segments="true";
+  } else {
+    $have_segments="false";
+  }
+  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
+  open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n";
+  open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n";
+  open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n";
+  while (<UI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    ($utt_id, $spk) = @col;
+    $utt2spk{$utt_id} = $spk;
+  }
+  if ($have_segments eq "true") {
+    while (<SEG>) {
+      chomp;
+      @col = split;
+      $reco2utt{$col[1]} = $col[0];
+    }
+  }
+  while (<WI>) {
+    chomp;
+    @col = split;
+    $pipe = join(" ", @col[1..@col-1]);
+    $reco2pipe{$col[0]} = $pipe;
+    $recolist{$col[0]} = $col[0];
+    if ($have_segments eq "false") {
+      $reco2utt{$col[0]} = $col[0];
+    }
+  }
+  while (<FI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    $spk2filt{$col[0]} = $col[1];
+  }
+
+  foreach $reco (sort keys %recolist) {
+    #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}};
+    #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}};
+    $reco2spk{$reco} = $reco;
+    $reco2filt{$reco} = $spk2filt{$reco};
+    if ($reco2filt{$reco} eq "") {
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2spk{$reco} = $spk;
+      $reco2filt{$reco} = $spk2filt{$spk};
+    }
+    while (1) {
+      # randomly pick a filter from another speaker
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2perturbspk{$reco} = $spk;
+      $reco2perturbfilt{$reco} = $spk2filt{$spk};
+      if ($reco2perturbfilt{$reco} ne $reco2filt{$reco}) {
+        last;
+      }
+    }
+  }
+
+  foreach $reco (sort keys %recolist) {
+    print WO "$prefix$reco $reco2pipe{$reco} apply-filter --inverse=false \"scp:echo $reco2spk{$reco} $reco2filt{$reco} |\" - - | apply-filter --inverse=true \"scp:echo $reco2perturbspk{$reco} $reco2perturbfilt{$reco} |\" - - |\n";
+  }
+
+' $srcdir/utt2spk $srcdir/segments $srcdir/wav.scp \
+$srcdir/spk_filter.scp $destdir/wav.scp
+
+if [ -f $srcdir/segments ]; then
+  # also apply the spk_prefix to the recording-ids.
+ cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map + + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments + + if [ -f $srcdir/reco2file_and_channel ]; then + utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel + fi + + rm $destdir/reco_map 2>/dev/null +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/spk2gender ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender +fi + + +rm $destdir/spk_map $destdir/utt_map 2>/dev/null +echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir diff --git a/egs/wsj/s5/utils/perturb_data_signal_v2.sh b/egs/wsj/s5/utils/perturb_data_signal_v2.sh new file mode 100755 index 00000000000..c205b67e5e0 --- /dev/null +++ b/egs/wsj/s5/utils/perturb_data_signal_v2.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# 2014 Tom Ko +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# wav.scp +# spk2utt +# utt2spk +# text +# spk_filter.scp +# It generates the files which are used for perturbing the data at signal-level. + +. utils/parse_options.sh + +if [ $# != 4 ]; then + echo "Usage: perturb_data_signal.sh " + echo "e.g.:" + echo " $0 3 'fp01' data/train_si284 data/train_si284p" + exit 1 +fi + +export LC_ALL=C + +num_parts=$1 +prefix=$2 +srcdir=$3 +destdir=$4 +spk_prefix=$prefix"-" +utt_prefix=$prefix"-" + +for f in spk2utt text utt2spk wav.scp spk_filter.scp; do + [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1; +done + +set -e; +set -o pipefail + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map +cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + + +# The following perl script is the core part. 
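# Sketch of the intended effect (all ids hypothetical): with <num-parts>=2, each speaker's
# utterances are split into parts 00 and 01, and each part becomes a new recording with its
# own randomly chosen filter; the new segments file then contains lines like
#   fp01-utt3 fp01-rec1-00 1.20 3.45
#   fp01-utt9 fp01-rec1-01 7.80 9.10
# and wav.scp gets one filtered pipe per part-recording (fp01-rec1-00, fp01-rec1-01, ...).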
+ +echo $spk_prefix | perl -e ' + $prefix = ; + chomp($prefix); + ($num_parts, $u2s_in, $s2u_in, $seg_in, $wav_in, $filt_in, $wav_out, $seg_out) = @ARGV; + if (open(SEG, "<$seg_in")) { + $have_segments="true"; + } else { + $have_segments="false"; + } + open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n"; + open(SI, "<$s2u_in") || die "Error: fail to open $s2u_in\n"; + open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n"; + open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n"; + open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n"; + open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n"; + while () { + chomp; + @col = split; + @col == 2 || die "Error: bad line $_\n"; + ($utt_id, $spk) = @col; + $utt2spk{$utt_id} = $spk; + } + while () { + chomp; + @col = split; + $spks = join(" ", @col[1..@col-1]); + $spk2utt{$col[0]} = $spks; + } + if ($have_segments eq "true") { + while () { + chomp; + @col = split; + $seg = join(" ", @col[2..@col-1]); + $reco2utt{$col[1]} = $col[0]; + $utt2reco{$col[0]} = $col[1]; + $utt2seg{$col[0]} = $seg; + } + } + while () { + chomp; + @col = split; + $pipe = join(" ", @col[1..@col-1]); + $reco2pipe{$col[0]} = $pipe; + $recolist{$col[0]} = $col[0]; + if ($have_segments eq "false") { + $reco2utt{$col[0]} = $col[0]; + } + } + while () { + chomp; + @col = split; + @col == 2 || die "Error: bad line $_\n"; + $spk2filt{$col[0]} = $col[1]; + } + + foreach $reco (sort keys %recolist) { + #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}}; + #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}}; + $reco2spk{$reco} = $reco; + $reco2filt{$reco} = $spk2filt{$reco}; + for (my $i=0; $i < $num_parts; $i++) { + $newreco2spk{$reco.$i} = $reco; + } + @spk2filt_rand{keys %spk2filt} = @spk2filt{keys %spk2filt}; + delete $spk2filt_rand{$reco}; + if ($reco2filt{$reco} eq "") { + $spk = (keys %spk2filt)[rand keys %spk2filt]; + $reco2spk{$reco} = $spk; + $reco2filt{$reco} = $spk2filt{$spk}; + delete $spk2filt_rand{$spk}; + } + for (my $i=0; $i < $num_parts; $i++) { + # randomly pick a filter from another speaker + $spk = (keys %spk2filt_rand)[rand keys %spk2filt_rand]; + $newreco2perturbspk{$reco.$i} = $spk; + $newreco2perturbfilt{$reco.$i} = $spk2filt{$spk}; + delete $spk2filt_rand{$spk}; + } + } + + foreach $spk (sort keys %spk2utt) { + @utts = split(" ", $spk2utt{$spk}); + $numutts = @utts; + if ($numutts < $num_parts) { + $partsize = $numutts; + } else { + $partsize = $numutts / $num_parts; + } + for (my $i=0; $i < $numutts; $i++) { + $partid = int($i / $partsize); + $utt = $utts[$i]; + $filled = sprintf "%02d", $partid; + print SO "$prefix$utt $prefix$utt2reco{$utt}-$filled $utt2seg{$utt}\n"; + $newrecolist{"$prefix$utt2reco{$utt}-$filled"} = "$prefix$utt2reco{$utt}-$filled"; + } + } + + foreach $reco (sort keys %recolist) { + for (my $i=0; $i < $num_parts; $i++) { + $filled = sprintf "%02d", $i; + if ($newrecolist{"$prefix$reco-$filled"} ne "") { + print WO "$prefix$reco-$filled $reco2pipe{$reco} apply-filter \"scp:echo $reco2spk{$reco} $reco2filt{$reco} |\" - - | apply-filter --inverse=true \"scp:echo $newreco2perturbspk{$reco.$i} $newreco2perturbfilt{$reco.$i} |\" - - |\n"; + } + } + } + +' $num_parts $srcdir/utt2spk $srcdir/spk2utt $srcdir/segments $srcdir/wav.scp \ +$srcdir/spk_filter.scp $destdir/wav.scp $destdir/segments + +if [ -f $srcdir/segments ]; then + # also apply the spk_prefix to the recording-ids. 
+ cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map + +# utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments + +# if [ -f $srcdir/reco2file_and_channel ]; then +# utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel +# fi + + rm $destdir/reco_map 2>/dev/null +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/spk2gender ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender +fi + + +rm $destdir/spk_map $destdir/utt_map 2>/dev/null +echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 43b8bce1f4c..0014f22a04e 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -28,20 +28,21 @@ # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt # and extra_questions.txt # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the # "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be # different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) +# The file "optional_silence.txt" contains just a single phone (typically SIL) # which is used for optional silence in the lexicon. # extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automtically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about # stress or tone). +# # This script adds word-position-dependent phones and constructs a host of other # derived files, that go in data/lang/. @@ -49,19 +50,20 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 +num_word_disambig_syms=1 position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source reverse=false -share_silence_phones=false # if true, then share pdfs of different silence +share_silence_phones=false # if true, then share pdfs of different silence # phones together. 
sil_prob=0.5 phone_symbol_table= # if set, use a specified phones.txt file. # end configuration sections -. utils/parse_options.sh +. utils/parse_options.sh -if [ $# -ne 4 ]; then +if [ $# -ne 4 ]; then echo "usage: utils/prepare_lang.sh " echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" echo " should contain the following files:" @@ -133,10 +135,10 @@ if $position_dependent_phones; then # adding the markers _B, _E, _S, _I depending on word position. # In this recipe, these markers apply to silence also. # Do this starting from lexiconp.txt only. - if "$silprob"; then + if "$silprob"; then perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt @@ -158,11 +160,11 @@ if $position_dependent_phones; then mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt fi fi - + # create $tmpdir/phone_map.txt # this has the format (on each line) # ... - # where the versions depend on the position of the phone within a word. + # where the versions depend on the position of the phone within a word. # For instance, we'd have: # AA AA_B AA_E AA_I AA_S # for (B)egin, (E)nd, (I)nternal and (S)ingleton @@ -174,11 +176,11 @@ if $position_dependent_phones; then # This phone map expands the phone lists into all the word-position-dependent # versions of the phone lists. - cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else - if "$silprob"; then + if "$silprob"; then cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt if $reverse; then echo "We do not support reverse option and silprob at the same time" @@ -245,10 +247,10 @@ cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_m # be inside a word. if $position_dependent_phones; then for suffix in _B _E _I _S; do - (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done for suffix in "" _B _E _I _S; do - (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done fi @@ -277,7 +279,7 @@ if [[ ! 
-z $phone_symbol_table ]]; then start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt + cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt else echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt @@ -313,7 +315,7 @@ fi cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' BEGIN { print " 0"; - } + } { if ($1 == "") { print " is in the vocabulary!" | "cat 1>&2" @@ -362,7 +364,7 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int # Create the basic L.fst without disambiguation symbols, for use -# in training. +# in training. if $silprob; then # Usually it's the same as having a fixed-prob L.fst @@ -386,7 +388,18 @@ cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; # integer version of oov symbol, used in some scripts. -# Create these lists of phones in colon-separated integer list form too, +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). +# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, # for purposes of being given to programs as command-line options. for f in silence nonsilence optional_silence disambig context_indep; do utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int @@ -415,20 +428,18 @@ utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonel # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra step where we create a loop to "pass through" the # disambiguation symbols from G.fst. 
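# Illustrative contents of the new word-disambiguation files (the integer ids below are
# hypothetical; they are whatever ids #0 has in phones.txt and words.txt):
#   phones/wdisambig.txt:         #0
#   phones/wdisambig_phones.int:  347
#   phones/wdisambig_words.int:   200001
# These two .int files are what fstaddselfloops now reads directly, replacing the old
# "echo $phone_disambig_symbol |" / "echo $word_disambig_symbol |" pipes removed below.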
-phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` if $silprob; then utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; else utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; fi diff --git a/egs/wsj/s5/utils/slurm.pl b/egs/wsj/s5/utils/slurm.pl index 68c269080ac..8095272732e 100755 --- a/egs/wsj/s5/utils/slurm.pl +++ b/egs/wsj/s5/utils/slurm.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# slurm.pl was created from the queue.pl +# slurm.pl was created from the queue.pl # queue.pl has the same functionality as run.pl, except that # it runs the job in question on the queue (Sun GridEngine). # This version of queue.pl uses the task array functionality @@ -20,7 +20,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -30,7 +30,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -60,7 +60,7 @@ my $qsub_opts = ""; my $sync = 0; my $num_threads = 1; -my $max_jobs_run; +my $max_jobs_run; my $gpu = 0; my $config = "conf/slurm.conf"; @@ -99,12 +99,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 3; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. 
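# Usage sketch (paths and option values are hypothetical); because of the interleaving,
# both of these forms are accepted:
#   utils/slurm.pl --mem 2G JOB=1:10 exp/foo/log/run.JOB.log echo "this is job JOB"
#   utils/slurm.pl JOB=1:10 --mem 2G exp/foo/log/run.JOB.log echo "this is job JOB"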
while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -121,10 +121,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -160,7 +160,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -168,17 +168,18 @@ () option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 +option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -option max_jobs_run=* # Do nothing default gpu=0 option gpu=0 -p shared option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. EOF # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -192,7 +193,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -212,12 +213,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; } if (exists $cli_options{$option}) { @@ -237,7 +238,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. 
default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -261,19 +262,25 @@ () for my $option (keys %cli_options) { if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { print STDERR "Ignoring $option\n"; next; } + my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { $max_jobs_run = $value; } - if (exists $cli_default_options{($option,$value)}) { + if ($option eq "max_jobs_run") { + if ($array_job != 1) { + print STDERR "Ignoring $option since this is not an array task."; + } else { + $max_jobs_run = $value; + } + } elsif (exists $cli_default_options{($option,$value)}) { $qsub_opts .= "$cli_default_options{($option,$value)} "; } elsif (exists $cli_config_options{$option}) { $qsub_opts .= "$cli_config_options{$option} "; } elsif (exists $cli_default_options{($option,"*")}) { $qsub_opts .= $cli_default_options{($option,"*")} . " "; } else { - if ($opened_config_file == 0) { $config = "default config file"; } + if ($opened_config_file == 0) { + $config = "default config file"; + } die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; } } @@ -301,7 +308,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -322,23 +329,23 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; + $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; + $queue_array_opt = "--array ${jobstart}-${jobend}"; } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get + $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -475,14 +482,14 @@ () } } - # Check that the job exists in SLURM. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SLURM. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("squeue -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. 
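# For example (sketch): if squeue exits with status 1 because the job id is unknown,
# system() returns 256, and $ret >> 8 recovers the exit status 1 that is tested just below.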
if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -546,7 +553,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; push @logfiles, $l; } diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 63f8bdbf3b9..19452c3c235 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -133,7 +133,7 @@ if [ -f $data/wav.scp ]; then ! cat $data/segments | \ awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; - + segments_len=`cat $data/segments | wc -l` if [ -f $data/text ]; then ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/text) && \ @@ -153,14 +153,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -188,14 +188,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -228,6 +228,7 @@ if [ -f $data/feats.scp ]; then fi fi + if [ -f $data/cmvn.scp ]; then check_sorted_and_uniq $data/cmvn.scp cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn @@ -294,4 +295,19 @@ for f in vad.scp utt2lang utt2uniq; do fi done + +if [ -f $data/utt2dur ]; then + check_sorted_and_uniq $data/utt2dur + cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur + if ! cmp -s $tmpdir/utts{,.utt2dur}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.feats} + exit 1; + fi + cat $data/utt2dur | \ + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 +fi + + echo "$0: Successfully validated data-directory $data" diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index ae087bd9578..f9a27584b07 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -22,6 +22,7 @@ exit(1); } +print "$0 " . join(" ", @ARGV) . 
"\n"; $lang = shift @ARGV; $exit = 0; @@ -89,15 +90,7 @@ $wint2sym{$wsymtab{$_}} = $_; } } -if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - print "--> $lang/words.txt is OK\n"; -} else { - $warning = 1; - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; -} +print "--> $lang/words.txt is OK\n"; print "\n"; # Checking phones/* ------------------------------- @@ -113,7 +106,6 @@ sub check_txt_int_csl { if (!open(CSL, "<$cat.csl")) { $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; } - if (-z "$cat.txt") { $warning = 1; print "--> WARNING: $cat.txt is empty\n"; } @@ -743,6 +735,76 @@ sub check_summation { } } +sub check_wdisambig { + print "Checking word-level disambiguation symbols...\n"; + # This block checks that one of the two following conditions hold: + # (1) for lang diretories prepared by older versions of prepare_lang.sh: + # The symbol '#0' should appear in words.txt and phones.txt, and should + # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int + # exist, and have the expected properties (see below for details). + my %wdisambig_words_hash; + my %wdisambig_words_string = ""; + + if (! -e "$lang/phones/wdisambig.txt") { + print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; + if (exists $wsymtab{"#0"}) { + print "--> $lang/words.txt has \"#0\"\n"; + $wdisambig_words_hash{$wsymtab{"#0"}} = 1; + $wdisambig_words_string = $wsymtab{"#0"}; + } else { + print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; + } + } else { + print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; + if (!open(T, "<$lang/phones/wdisambig.txt")) { + print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; + } + chomp(my @wdisambig = ); + close(T); + if (!open(W, "<$lang/phones/wdisambig_words.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_words = ); + close(W); + if (!open(P, "<$lang/phones/wdisambig_phones.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_phones =
<P>
); + close(P); + my $len = @wdisambig, $len2; + if (($len2 = @wdisambig_words) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + $exit = 1; return; + } + if (($len2 = @wdisambig_phones) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + $exit = 1; return; + } + for (my $i = 0; $i < $len; $i++) { + if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; + $exit = 1; return; + } + } + for (my $i = 0; $i < $len; $i++) { + if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; + $exit = 1; return; + } + } + foreach my $i ( @wdisambig_words ) { + $wdisambig_words_hash{$i} = 1; + $wdisambig_words_string .= " " . $i; + } + } +} + +check_wdisambig(); + if (-e "$lang/G.fst") { # Check that G.fst is ilabel sorted and nonempty. $text = `. ./path.sh; fstinfo $lang/G.fst`; @@ -781,21 +843,17 @@ sub check_summation { } # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and . - $cmd = ". ./path.sh; fstprint $lang/G.fst | awk -v disambig=$lang/phones/disambig.int -v words=$lang/words.txt 'BEGIN{while((getline0) is_disambig[\$1]=1; is_disambig[0] = 1; while((getline0){ if(\$1==\"\"||\$1==\"\") is_forbidden[\$2]=1;}} {if(NF<3 || is_disambig[\$3]) print; else if(is_forbidden[\$3] || is_forbidden[\$4]) { print \"Error: line \" \$0 \" in G.fst contains forbidden symbol or \" | \"cat 1>&2\"; exit(1); }}' | fstcompile | fstinfo "; - $output = `$cmd`; - if ($output !~ m/# of states\s+[1-9]/) { # fstinfo did not read a nonempty FST (there should be final probs at least)... - print "--> ERROR: failure running command to check for disambig-sym loops [possibly G.fst " . - "contained the forbidden symbols or , or possibly some other error.. Output was: \n"; - print $output; - $exit = 1; - } - if ($output !~ m/cyclic\s+n/) { # FST was cyclic after selecting only for disambig symbols. This is now allowed. - print "--> ERROR: G.fst contained cycles with only disambiguation symbols or epsilons on the input. Would cause determinization failure in graph creation.\n"; - $exit = 1; - } else { - print "--> G.fst did not contain cycles with only disambig symbols or epsilon on the input, and did not contain\n" . - "the forbidden symbols or (if present in vocab) on the input or output.\n"; + # epsilons on the input, or the forbidden symbols and (and a few + # related checks + + if (-e "$lang/G.fst") { + system("utils/lang/check_g_properties.pl $lang"); + if ($? != 0) { + print "--> ERROR: failure running check_g_properties.pl\n"; + $exit = 1; + } else { + print("--> utils/lang/check_g_properties.pl succeeded.\n"); + } } } diff --git a/notes b/notes deleted file mode 100644 index b0777bd71f4..00000000000 --- a/notes +++ /dev/null @@ -1,109 +0,0 @@ --- -TODO: --- - - Transition-model equivalent. - - chain::Topology - - This stores the topology for each phone in the 'chain-model' modeling code. - It has a list of phones and allows you to get the topology FST for each phone. - - A topology is an unweighted, epsilon-free acceptor FST [acceptor means the - ilabels and olabels are the same]. Its initial state must not be final. 
- The labels on the arcs must start from 1 and have no gaps-- i.e. they must - form a set like (1, 2) or (1, 2, 3). - - An example FST would be (in OpenFst acceptor format), - -0 1 1 # transition from state 0 to state 1 with label 1. -1 1 2 # transition from state 1 to state 1 (self-loop) with label 2. -1 0 # this says that state 1 is final. - - - A Topology object is 'alignable' if all of the phones' topology FSTs - have the property that the set of labels on the arcs from the start state are - disjoint from the set of labels on other arcs, and there are no transitions to - the start state. This means that we can identify the beginning of the phone. - - - chain::PhoneContextModel - - list of phones - - LeftContext() ... the number of phones of left context (there is no right context). - - A mechanism to find a particular context-depenendent phone: you have to call the following - LeftContext() + 1 times. - - // returns new state. if phone_in_context != NULL, outputs - // the cd_phone_index to there (FST-wise, view this as the input symbol on the - // transition, where 'phone' is the output symbool. - int32 AdvanceState(int32 cur_state, int32 phone, int32 *cd_phone); - - Fst *GetFst(); - - - Phone indexes may not include zero (and may have gaps) - - - cd_phoness are 1-based, without gaps. - - - We will initialize the PhoneContextModel using a tree for now, but in - future we may enable different ways of doing this. We'll require that - the tree be trained using only one pdf-class. - - TODO: enable tree-building with separate stats per state, but to give a single - index per phone. [so store array of Gaussian stats]. ----- - - To get labels for the individual transitions on the context-dependent phones, - we need to store an offset for each - - chain::ContextDependentTopology [note, you can use this even if you don't have - context]. - stores Topology and PhoneContextModel, and also stores offsets for each context-dependent - phone that allow us to assign a unique context-dependent label for each label in the - cd-label - - cd-labels will be 1-based so they can appear on FSTs. We may subtract one so they - can appear at the output layer of a nnet. ---- - - FST-based representation of phone language model?? We can get it from class - LanguageModel as an FST, and then prune away disallowed phone sequences and - rebalance. The output can be in the standard FST representation. - - What do we do about initial alphas and final betas? We want to limit it - to the same states that are active in the phone lattice at that time, as a - better approximation of the end effects. - - For the betas, it's just a question of what [context-independent] phones are - active at the end-time. - - We can limit it with reasonable acuracy by just considering the set of - symbols that are active at times 0 and T, and then limiting the alphas and - betas to the states from which those symbols would be emitted. - By time T we mean one past the end of the file. We can store information - saying that either it's a final-prob, or just storing the active symbols - at that point. - - - - - ---- - - - phones. It will store the probs more compactly than OpenFst. - ---- - Suppose we have 200 phones, and 500 history-states. - If there are 5000 CD-phones, then each phone has on average 25 versions... - - Suppose for each history-state, the output-prob for a phone is just a - combination of some subspace of output-probs for that phone. - - num-params = 500 * 200 * 25 = 2.5 million. 
- --- - - - - diff --git a/src/Makefile b/src/Makefile index 57a4b98e0aa..4fe95251b1e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -147,7 +147,7 @@ $(EXT_SUBDIRS) : mklibdir bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain #2)The libraries have inter-dependencies base: diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index 9629c5466ad..9311645cc0c 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -3,6 +3,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian; // Johns Hopkins University (Author: Daniel Povey) +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -87,6 +88,112 @@ template inline void ReadBasicType(std::istream &is, } } +// Template that covers integers. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. + os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. + os << "[ "; + typename std::vector >::const_iterator iter = v.begin(), + end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(iter->first) << ',' + << static_cast(iter->second) << ' '; + else + os << iter->first << ',' + << iter->second << ' '; + } + os << "]\n"; + } + if (os.fail()) { + throw std::runtime_error("Write failure in WriteIntegerPairVector."); + } +} + +// Template that covers integers. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz*2); + } + } else { + std::vector > tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. + is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. 
+ int16 next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } else { + T next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. + } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerPairVector: read failure at file position " + << is.tellg(); +} template inline void WriteIntegerVector(std::ostream &os, bool binary, const std::vector &v) { diff --git a/src/base/io-funcs-test.cc b/src/base/io-funcs-test.cc index 63506073ff8..dd05326d5ed 100644 --- a/src/base/io-funcs-test.cc +++ b/src/base/io-funcs-test.cc @@ -43,8 +43,20 @@ void UnitTestIo(bool binary) { WriteIntegerVector(outfile, binary, vec2); if (!binary) outfile << " \n"; std::vector vec3; - for (size_t i = 0; i < 10; i++) vec3.push_back(Rand()%100); + + int32 size = RandInt(0, 10); + for (size_t i = 0; i < size; i++) vec3.push_back(Rand()%100); WriteIntegerVector(outfile, binary, vec3); + std::vector > vec4; + WriteIntegerPairVector(outfile, binary, vec4); + if (!binary && Rand()%2 == 0) outfile << " \n"; + std::vector > vec5; + for (size_t i = 0; i < size; i++) vec5.push_back(std::make_pair(Rand()%100 - 10, Rand()%100 - 10)); + WriteIntegerPairVector(outfile, binary, vec5); + if (!binary) outfile << " \n"; + std::vector > vec6; + for (size_t i = 0; i < size; i++) vec6.push_back(std::make_pair(Rand()%100, Rand()%100)); + WriteIntegerPairVector(outfile, binary, vec6); if (!binary && Rand()%2 == 0) outfile << " \n"; const char *token1 = "Hi"; WriteToken(outfile, binary, token1); @@ -90,6 +102,15 @@ void UnitTestIo(bool binary) { std::vector vec3_in; ReadIntegerVector(infile, binary_in, &vec3_in); KALDI_ASSERT(vec3_in == vec3); + std::vector > vec4_in; + ReadIntegerPairVector(infile, binary_in, &vec4_in); + KALDI_ASSERT(vec4_in == vec4); + std::vector > vec5_in; + ReadIntegerPairVector(infile, binary_in, &vec5_in); + KALDI_ASSERT(vec5_in == vec5); + std::vector > vec6_in; + ReadIntegerPairVector(infile, binary_in, &vec6_in); + KALDI_ASSERT(vec6_in == vec6); std::string token1_in, token2_in; KALDI_ASSERT(Peek(infile, binary_in) == static_cast(*token1)); KALDI_ASSERT(PeekToken(infile, binary_in) == static_cast(*token1)); diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index ba0cf1c1c7c..4caddc6b5b3 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -181,6 +182,16 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, template inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); +/// Function for writing STL vectors of pairs of integer types. 
+template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v); + +/// Function for reading STL vector of pairs of integer types. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v); + /// The WriteToken functions are for writing nonempty sequences of non-space /// characters. They are not for general strings. void WriteToken(std::ostream &os, bool binary, const char *token); diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index e28ddcc1a09..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index 13a3412a9bb..1ae1dc0b758 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -20,7 +20,9 @@ #include #elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW) #include +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf +#endif /* _MSC_VER < 1900 */ #else #include #endif diff --git a/src/bin/Makefile b/src/bin/Makefile index ac175e42e0e..74b1b5de62b 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -5,7 +5,8 @@ include ../kaldi.mk BINFILES = align-equal align-equal-compiled acc-tree-stats \ show-alignments compile-questions cluster-phones \ - compute-wer make-h-transducer add-self-loops convert-ali \ + compute-wer compute-wer-bootci make-h-transducer \ + add-self-loops convert-ali \ compile-train-graphs compile-train-graphs-fsts arpa2fst \ make-pdf-to-tid-transducer make-ilabel-transducer show-transitions \ ali-to-phones ali-to-post weight-silence-post acc-lda est-lda \ diff --git a/src/bin/analyze-counts.cc b/src/bin/analyze-counts.cc index 60be710c79d..6c5d0328936 100644 --- a/src/bin/analyze-counts.cc +++ b/src/bin/analyze-counts.cc @@ -1,6 +1,6 @@ // bin/analyze-counts.cc -// Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +// Copyright 2012-2016 Brno University of Technology (Author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -51,6 +51,15 @@ int main(int argc, char *argv[]) { po.Register("binary", &binary, "write in binary mode"); po.Register("symbol-table", &symbol_table_filename, "Read symbol table for display of counts"); + int32 counts_dim = 0; + po.Register("counts-dim", &counts_dim, + "Output dimension of the counts, a hint for dimension auto-detection."); + + std::string frame_weights; + po.Register("frame-weights", &frame_weights, "Per-frame weights (counting weighted frames)."); + std::string utt_weights; + po.Register("utt-weights", &utt_weights, "Per-utterance weights (counting weighted frames)."); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -61,39 +70,78 @@ int main(int argc, char *argv[]) { std::string 
alignments_rspecifier = po.GetArg(1), wxfilename = po.GetArg(2); - SequentialInt32VectorReader reader(alignments_rspecifier); + SequentialInt32VectorReader alignment_reader(alignments_rspecifier); - // Get the counts - std::vector counts; - int32 num_done = 0; - for (; !reader.Done(); reader.Next()) { - std::string key = reader.Key(); - std::vector alignment = reader.Value(); + RandomAccessBaseFloatVectorReader weights_reader; + if (frame_weights != "") { + weights_reader.Open(frame_weights); + } + RandomAccessBaseFloatReader utt_weights_reader; + if (utt_weights != "") { + utt_weights_reader.Open(utt_weights); + } + // Buffer for the counts, + Vector counts(counts_dim, kSetZero); + + // Get the counts, + int32 num_done = 0, num_other_error = 0; + for (; !alignment_reader.Done(); alignment_reader.Next()) { + std::string utt = alignment_reader.Key(); + // check we have per-frame weights, + if (frame_weights != "" && !weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-frame weights"; + num_other_error++; + continue; + } + // check we have per-utterance weights, + if (utt_weights != "" && !utt_weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-utterance weight"; + num_other_error++; + continue; + } + + // Get the alignment, + const std::vector &alignment = alignment_reader.Value(); + + // Get the weights, + BaseFloat utt_w = (utt_weights == "" ? 1.0 : utt_weights_reader.Value(utt)); + Vector frame_w; + if (frame_weights != "") { + frame_w = weights_reader.Value(utt); + KALDI_ASSERT(frame_w.Dim() == alignment.size()); + } + + // Accumulate the counts, for (size_t i = 0; i < alignment.size(); i++) { - int32 value = alignment[i]; - if(value >= counts.size()) { - counts.resize(value+1); + // Extend the vector if needed, + if (alignment[i] >= counts.Dim()) { + Vector tmp(counts); + counts.Resize(alignment[i]+1, kSetZero); + counts.Range(0, tmp.Dim()).CopyFromVec(tmp); } - counts[value]++; // Accumulate + // Accumulate, + counts(alignment[i]) += 1.0 * utt_w * (frame_weights == "" ? 1.0 : frame_w(i)); } num_done++; } - // We need at least one occurence for each tgt, so there is no nan during decoding - std::vector counts_nozero(counts); - for(size_t i = 0; i < counts.size(); i++) { - if(counts_nozero[i] == 0) { - KALDI_WARN << "Zero count for element " << i << ", force setting to one." 
- << " This avoids divide-by-zero when we use the counts in decoding."; - counts_nozero[i]++; + // Report elements with zero counts (this is suspicious), + for (size_t i = 0; i < counts.Dim(); i++) { + if (0.0 == counts(i)) { + KALDI_WARN << "Zero count for label " << i << ", this is suspicious."; } } - // Write + // Add a ``half-frame'' to all the elements, + // (avoids zero-counts, which would cause problems in decoding), + Vector counts_nozero(counts); + counts_nozero.Add(0.5); + + // Write, Output ko(wxfilename, binary); - WriteIntegerVector(ko.Stream(), binary, counts_nozero); + counts_nozero.Write(ko.Stream(), binary); //// //// THE REST IS FOR ANALYSIS, IT GETS PRINTED TO LOG @@ -108,16 +156,16 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Could not read symbol table from file " << symbol_table_filename; } - // sort the counts - std::vector > sorted_counts; - for (int32 i = 0; i < counts.size(); i++) { - sorted_counts.push_back(std::make_pair(static_cast(counts[i]), i)); + // sort the counts, + std::vector > sorted_counts; + for (int32 i = 0; i < counts.Dim(); i++) { + sorted_counts.push_back(std::make_pair(static_cast(counts(i)), i)); } std::sort(sorted_counts.begin(), sorted_counts.end()); - // print + // print, std::ostringstream os; - int32 sum = std::accumulate(counts.begin(),counts.end(), 0); + double sum = counts.Sum(); os << "Printing...\n### The sorted count table," << std::endl; os << "count\t(norm),\tid\t(symbol):" << std::endl; for (int32 i=0; i > & edit_word_per_hyp) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + int32 num_words = 0, word_errs = 0, num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent; + if (!hyp_reader.HasKey(key)) { + if (mode == "strict") + KALDI_ERR << "No hypothesis for key " << key << " and strict " + "mode specifier."; + if (mode == "present") // do not score this one. 
+ continue; + } else { + hyp_sent = hyp_reader.Value(key); + } + num_words = ref_sent.size(); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + } +} + +void GetEditsDualHyp(const std::string &hyp_rspecifier, + const std::string &hyp_rspecifier2, + const std::string &ref_rspecifier, + const std::string &mode, + std::vector > & edit_word_per_hyp, + std::vector > & edit_word_per_hyp2) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + RandomAccessTokenVectorReader hyp_reader2(hyp_rspecifier2); + int32 num_words = 0, word_errs = 0, + num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent, hyp_sent2; + if (mode == "strict" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) { + KALDI_ERR << "No hypothesis for key " << key << " in both transcripts " + "comparison is not possible."; + } else if (mode == "present" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) + continue; + + num_words = ref_sent.size(); + + //all mode, if a hypothesis is not present, consider as an error + if(hyp_reader.HasKey(key)){ + hyp_sent = hyp_reader.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + + if(hyp_reader2.HasKey(key)){ + hyp_sent2 = hyp_reader2.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent2, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp2.push_back(std::pair(word_errs, num_words)); + } +} + +void GetBootstrapWERInterval( + const std::vector > & edit_word_per_hyp, + int32 replications, + BaseFloat *mean, BaseFloat *interval) { + BaseFloat wer_accum = 0.0, wer_mult_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 num_words = 0, word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first; + num_words += edit_word_per_hyp[random_pos].second; + } + + BaseFloat wer_rep = static_cast(word_errs) / num_words; + wer_accum += wer_rep; + wer_mult_accum += wer_rep*wer_rep; + } + + // Compute mean WER and std WER + *mean = wer_accum / replications; + *interval = 1.96*sqrt(wer_mult_accum/replications-(*mean)*(*mean)); +} + +void GetBootstrapWERTwoSystemComparison( + const std::vector > & edit_word_per_hyp, + const std::vector > & edit_word_per_hyp2, + int32 replications, BaseFloat *p_improv) { + int32 improv_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first - + edit_word_per_hyp2[random_pos].first; + } + if(word_errs > 0) + ++improv_accum; + } + // Compute mean WER and std WER + *p_improv = static_cast(improv_accum) / replications; +} + +} //namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + + try { + const char *usage = + "Compute a bootstrapping of WER to 
extract the 95\% confidence interval.\n" + "Take a reference and a transcription file, in integer or text format,\n" + "and outputs overall WER statistics to standard output along with its\n" + "confidence interval using the bootstrap methos of Bisani and Ney.\n" + "If a second transcription file corresponding to the same reference is\n" + "provided, a bootstrap comparison of the two transcription is performed\n" + "to estimate the probability of improvement.\n" + "\n" + "Usage: compute-wer-bootci [options] []\n" + "E.g.: compute-wer-bootci --mode=present ark:data/train/text ark:hyp_text\n" + "or compute-wer-bootci ark:data/train/text ark:hyp_text ark:hyp_text2\n" + "See also: compute-wer\n"; + + ParseOptions po(usage); + + std::string mode = "strict"; + po.Register("mode", &mode, + "Scoring mode: \"present\"|\"all\"|\"strict\":\n" + " \"present\" means score those we have transcriptions for\n" + " \"all\" means treat absent transcriptions as empty\n" + " \"strict\" means die if all in ref not also in hyp"); + + int32 replications = 10000; + po.Register("replications", &replications, + "Number of replications to compute the intervals"); + + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + std::string ref_rspecifier = po.GetArg(1); + std::string hyp_rspecifier = po.GetArg(2); + std::string hyp2_rspecifier = (po.NumArgs() == 3?po.GetArg(3):""); + + if (mode != "strict" && mode != "present" && mode != "all") { + KALDI_ERR << + "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got " + << mode; + } + + //Get editions per each utterance + std::vector > edit_word_per_hyp, edit_word_per_hyp2; + if(hyp2_rspecifier.empty()) + GetEditsSingleHyp(hyp_rspecifier, ref_rspecifier, mode, edit_word_per_hyp); + else + GetEditsDualHyp(hyp_rspecifier, hyp2_rspecifier, ref_rspecifier, mode, + edit_word_per_hyp, edit_word_per_hyp2); + + //Extract WER for a number of replications of the same size + //as the hypothesis extracted + BaseFloat mean_wer = 0.0, interval = 0.0, + mean_wer2 = 0.0, interval2 = 0.0, + p_improv = 0.0; + + GetBootstrapWERInterval(edit_word_per_hyp, replications, + &mean_wer, &interval); + + if(!hyp2_rspecifier.empty()) { + GetBootstrapWERInterval(edit_word_per_hyp2, replications, + &mean_wer2, &interval2); + + GetBootstrapWERTwoSystemComparison(edit_word_per_hyp, edit_word_per_hyp2, + replications, &p_improv); + } + + // Print the output, + std::cout.precision(2); + std::cerr.precision(2); + std::cout << "Set1: %WER " << std::fixed << 100*mean_wer << + " 95\% Conf Interval [ " << 100*mean_wer-100*interval << + ", " << 100*mean_wer+100*interval << " ]" << '\n'; + + if(!hyp2_rspecifier.empty()) { + std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 << + " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 << + ", " << 100*mean_wer2+100*interval2 << " ]" << '\n'; + + std::cout << "Probability of Set2 improving Set1: " << std::fixed << + 100*p_improv << '\n'; + } + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 20f58d52b7d..42404e38384 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -101,7 +101,8 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 TypeTwoUsage(const ParseOptions &po, - bool binary) { + bool binary, + bool average = false) { KALDI_ASSERT(po.NumArgs() == 2); KALDI_ASSERT(ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && "vector-sum: first argument must 
be an rspecifier"); @@ -133,6 +134,8 @@ int32 TypeTwoUsage(const ParseOptions &po, } } } + + if (num_done > 0 && average) sum.Scale(1.0 / num_done); Vector sum_float(sum); WriteKaldiObject(sum_float, po.GetArg(2), binary); @@ -199,12 +202,13 @@ int main(int argc, char *argv[]) { " e.g.: vector-sum --binary=false 1.vec 2.vec 3.vec sum.vec\n" "See also: copy-vector, dot-weights\n"; - bool binary; + bool binary, average = false; ParseOptions po(usage); po.Register("binary", &binary, "If true, write output as binary (only " "relevant for usage types two or three"); + po.Register("average", &average, "Do average instead of sum"); po.Read(argc, argv); @@ -219,7 +223,7 @@ int main(int argc, char *argv[]) { ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == kNoWspecifier) { // input from a single table, output not to table. - exit_status = TypeTwoUsage(po, binary); + exit_status = TypeTwoUsage(po, binary, average); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == diff --git a/src/chain/Makefile b/src/chain/Makefile index e24913c06f2..c02844767f8 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -12,7 +12,7 @@ OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ language-model.o chain-denominator.o chain-training.o ifeq ($(CUDA), true) - OBJFILES += chain-kernels.o + OBJFILES += chain-kernels.o endif LIBNAME = kaldi-chain @@ -53,7 +53,7 @@ endif ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ ../fstext/kaldi-fstext.a \ - ../matrix/kaldi-matrix.a ../cudamatrix/kaldi-cudamatrix.a \ + ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ ../util/kaldi-util.a ../base/kaldi-base.a diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h index 7ea58038918..52e388a3f2e 100644 --- a/src/chain/chain-datastruct.h +++ b/src/chain/chain-datastruct.h @@ -45,7 +45,8 @@ extern "C" { }; - + // Search for this in chain-kernels.cu for an explanation. 
+ enum { kThresholdingPowerOfTwo = 14 }; } diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index a654ad7d05f..ceb61a550f0 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -139,77 +139,6 @@ void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { Vector avg_prob_float(avg_prob); initial_probs_ = avg_prob_float; - special_hmm_state_ = ComputeSpecialState(fst, avg_prob_float); -} - -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state) { - int32 num_states = fst.NumStates(), - num_states_can_reach = 0; - KALDI_ASSERT(dest_state >= 0 && dest_state < num_states); - std::vector can_reach(num_states, false); - std::vector > reverse_transitions(num_states); - for (int32 s = 0; s < num_states; s++) - for (fst::ArcIterator aiter(fst, s); !aiter.Done(); - aiter.Next()) - reverse_transitions[aiter.Value().nextstate].push_back(s); - std::vector queue; - can_reach[dest_state] = true; - queue.push_back(dest_state); - num_states_can_reach++; - while (!queue.empty()) { - int32 state = queue.back(); - queue.pop_back(); - std::vector::const_iterator iter = reverse_transitions[state].begin(), - end = reverse_transitions[state].end(); - for (; iter != end; ++iter) { - int32 prev_state = *iter; - if (!can_reach[prev_state]) { - can_reach[prev_state] = true; - queue.push_back(prev_state); - num_states_can_reach++; - } - } - } - KALDI_ASSERT(num_states_can_reach >= 1 && - num_states_can_reach <= num_states); - return num_states_can_reach; -} - - -int32 DenominatorGraph::ComputeSpecialState( - const fst::StdVectorFst &fst, - const Vector &initial_probs) { - int32 num_states = initial_probs.Dim(); - std::vector > pairs(num_states); - for (int32 i = 0; i < num_states; i++) - pairs.push_back(std::pair(-initial_probs(i), i)); - // the first element of each pair is the negative of the initial-prob, - // so when we sort, the highest initial-prob will be first. - std::sort(pairs.begin(), pairs.end()); - // this threshold of 0.75 is pretty arbitrary. We reject any - // state if it can't be reached by 75% of all other states. - // In practice we think that states will either be reachable by - // almost-all states, or almost-none (e.g. states that are active - // only at utterance-beginning), so this threshold shouldn't - // be too critical. - int32 min_states_can_reach = 0.75 * num_states; - for (int32 i = 0; i < num_states; i++) { - int32 state = pairs[i].second; - int32 n = NumStatesThatCanReach(fst, state); - if (n < min_states_can_reach) { - KALDI_WARN << "Rejecting state " << state << " as a 'special' HMM state " - << "(for renormalization in fwd-bkwd), because it's only " - << "reachable by " << n << " out of " << num_states - << " states."; - } else { - return state; - } - } - KALDI_ERR << "Found no states that are reachable by at least " - << min_states_can_reach << " out of " << num_states - << " states. This is unexpected. Change the threshold"; - return -1; } void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, @@ -261,6 +190,34 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { fst::Decode(fst, encoder); } +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. 
The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { fst::PushSpecial(fst, fst::kDelta * 0.01); @@ -414,6 +371,8 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, DenGraphMinimizeWrapper(&transition_id_fst); + SortOnTransitionCount(&transition_id_fst); + *den_fst = transition_id_fst; CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); PrintDenGraphStats(*den_fst); diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h index 8e5ee39e4bd..b2510651f39 100644 --- a/src/chain/chain-den-graph.h +++ b/src/chain/chain-den-graph.h @@ -88,13 +88,6 @@ class DenominatorGraph { // Note: we renormalize each HMM-state to sum to one before doing this. const CuVector &InitialProbs() const; - // returns the index of the HMM-state that has the highest value in - // InitialProbs (and which we believe will always be reachable from most other - // states... later on we may check this more carefully [TODO]). - // It's used in getting the 'arbitrary_scale' value to keep the alphas - // in a good dynamic range. - int32 SpecialHmmState() const { return special_hmm_state_; } - // This function outputs a modifified version of the FST that was used to // build this object, that has an initial-state with epsilon transitions to // each state, with weight determined by initial_probs_; and has each original @@ -116,23 +109,15 @@ class DenominatorGraph { // functions called from the constructor void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); - // work out the initial-probs and the 'special state' - // Note, there are no final-probs; we treat all states as final - // with probability one [we have a justification for this.. - // assuming it's roughly a well-normalized HMM, this makes sense; - // note that we train on chunks, so the beginning and end of a chunk - // appear at arbitrary points in the sequence. - // At both beginning and end of the chunk, we limit ourselves to - // only those pdf-ids that were allowed in the numerator sequence. + // work out the initial-probs. Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. 
At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. void SetInitialProbs(const fst::StdVectorFst &fst); - // return a suitable 'special' HMM-state used for normalizing probabilities in - // the forward-backward. It has to have a reasonably high probability and be - // reachable from most of the graph. returns a suitable state-index - // that we can set special_hmm_state_ to. - int32 ComputeSpecialState(const fst::StdVectorFst &fst, - const Vector &initial_probs); - // forward_transitions_ is an array, indexed by hmm-state index, // of start and end indexes into the transition_ array, which // give us the set of transitions out of this state. @@ -152,23 +137,9 @@ class DenominatorGraph { // distribution of the HMM. This isn't too critical. CuVector initial_probs_; - // The index of a somewhat arbitrarily chosen HMM-state that we - // use for adjusting the alpha probabilities. It needs to be - // one that is reachable from all states (i.e. not a special - // state that's only reachable at sentence-start). We choose - // whichever one has the greatest initial-prob. It's set - // in SetInitialProbs(). - int32 special_hmm_state_; - int32 num_pdfs_; }; -// returns the number of states from which there is a path to -// 'dest_state'. Utility function used in selecting 'special' state -// for normalization of probabilities. -int32 NumStatesThatCanReach(const fst::StdVectorFst &fst, - int32 dest_state); - // Function that does acceptor minimization without weight pushing... // this is useful when constructing the denominator graph. diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index eaee850a999..258c33cd465 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -39,12 +39,23 @@ DenominatorComputation::DenominatorComputation( std::min(exp_nnet_output_transposed_.NumCols(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), - alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_, + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, kUndefined), - beta_(2, den_graph_.NumStates() * num_sequences_, kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), tot_prob_(num_sequences_, kUndefined), tot_log_prob_(num_sequences_, kUndefined), - log_correction_term_(num_sequences_, kUndefined) { + log_correction_term_(num_sequences_, kUndefined), + ok_(true) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + // make sure the alpha sums and beta sums are zeroed. 
+ alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); exp_nnet_output_transposed_.ApplyExp(); } @@ -70,13 +81,12 @@ void DenominatorComputation::AlphaFirstFrame() { void DenominatorComputation::AlphaGeneralFrame(int32 t) { KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); BaseFloat *this_alpha = alpha_.RowData(t); - const BaseFloat *prev_alpha = alpha_.RowData(t - 1); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = den_graph_.SpecialHmmState(); + num_sequences = num_sequences_; // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, @@ -90,8 +100,8 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions, - num_sequences, special_hmm_state, prob_data, - probs.Stride(), prev_alpha, this_alpha); + num_sequences, prob_data, probs.Stride(), + prev_alpha_dash, this_alpha); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -110,18 +120,19 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { int32 pdf_id = trans_iter->pdf_id, prev_hmm_state = trans_iter->hmm_state; BaseFloat prob = prob_data[pdf_id * prob_stride + s], - this_prev_alpha = prev_alpha[prev_hmm_state * num_sequences + s]; + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; this_tot_alpha += this_prev_alpha * transition_prob * prob; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -129,37 +140,89 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) { } } +void DenominatorComputation::AlphaDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. 
+ // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + // the alpha-dash is the sum of alpha over all states. + CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. +} + +// compute beta from beta-dash. +void DenominatorComputation::Beta(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat + // will contain the actual beta (i.e. the counterpart of alpha), + // not the beta-dash. + beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); +} + BaseFloat DenominatorComputation::Forward() { AlphaFirstFrame(); - for (int32 t = 1; t <= frames_per_sequence_; t++) + AlphaDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { AlphaGeneralFrame(t); + AlphaDash(t); + } return ComputeTotLogLike(); } BaseFloat DenominatorComputation::ComputeTotLogLike() { tot_prob_.Resize(num_sequences_); - // View the last alpha as a matrix of size num-hmm-states by num-sequences. - CuSubMatrix last_alpha(alpha_.RowData(frames_per_sequence_), - den_graph_.NumStates(), - num_sequences_, - num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); - tot_prob_.AddRowSumMat(1.0, last_alpha, 0.0); + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); // we should probably add an ApplyLog() function that takes a vector argument. tot_log_prob_ = tot_prob_; tot_log_prob_.ApplyLog(); BaseFloat tot_log_prob = tot_log_prob_.Sum(); - // We now have to add something for the arbitrary scaling factor. the - // inverses of all the alphas for hmm-states numbered zero, for t = 0 - // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in the - // transition-probs, so we need to multiply them all together (not inversed) - // and add them as a correction term to the total log-likes. Note: the + // We now have to add something for the arbitrary scaling factor. [note: the // purpose of the arbitrary scaling factors was to keep things in a good - // floating-point range. + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. 
+ // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); CuSubMatrix inv_arbitrary_scales( alpha_, 0, frames_per_sequence_, - num_sequences_ * den_graph_.SpecialHmmState(), num_sequences_); + num_sequences_ * num_hmm_states, num_sequences_); CuMatrix log_inv_arbitrary_scales( inv_arbitrary_scales); log_inv_arbitrary_scales.ApplyLog(); @@ -170,12 +233,16 @@ BaseFloat DenominatorComputation::ComputeTotLogLike() { -void DenominatorComputation::Backward( +bool DenominatorComputation::Backward( BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv) { - BetaLastFrame(); + BetaDashLastFrame(); + Beta(frames_per_sequence_); for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { - BetaGeneralFrame(t); + BetaDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaGeneralFrameDebug(t); + Beta(t); if (t % kMaxDerivTimeSteps == 0) { // commit the derivative stored in exp_nnet_output_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. @@ -190,35 +257,35 @@ void DenominatorComputation::Backward( *nnet_output_deriv, t * num_sequences_, chunk_frames * num_sequences_, 0, num_pdfs); - output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, - kTrans); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); if (t != 0) transposed_deriv_part.SetZero(); } } + return ok_; } -void DenominatorComputation::BetaLastFrame() { - // sets up the beta on the last frame (frame == frames_per_sequence_). Note that - // the betas we use here contain a 1/(tot-prob) factor in order to simplify - // the backprop. +void DenominatorComputation::BetaDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. int32 t = frames_per_sequence_; - BaseFloat *last_frame_beta = beta_.RowData(t % 2); + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); // create a 'fake matrix' - view this row as a matrix. - CuSubMatrix beta_mat(last_frame_beta, - den_graph_.NumStates(), - num_sequences_, - num_sequences_); + CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); CuVector inv_tot_prob(tot_prob_); inv_tot_prob.InvertElements(); // the beta values at the end of the file only vary with the sequence-index, // not with the HMM-index. We treat all states as having a final-prob of one. - beta_mat.CopyRowsFromVec(inv_tot_prob); + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); } -void DenominatorComputation::BetaGeneralFrame(int32 t) { +void DenominatorComputation::BetaDashGeneralFrame(int32 t) { KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); // t_wrapped gives us the time-index we use when indexing @@ -226,9 +293,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { // matrix, storing only chunks of frames at a time, and we add it to the // non-transposed output whenever we finish a chunk. 
int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); - const BaseFloat *this_alpha = alpha_.RowData(t), + const BaseFloat *this_alpha_dash = alpha_.RowData(t), *next_beta = beta_.RowData((t + 1) % 2); - BaseFloat *this_beta = beta_.RowData(t % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2); const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); const DenominatorGraphTransition *transitions = den_graph_.Transitions(); // 'probs' is the matrix of pseudo-likelihoods for frame t. @@ -238,8 +305,7 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { t_wrapped * num_sequences_, num_sequences_); int32 num_hmm_states = den_graph_.NumStates(), - num_sequences = num_sequences_, - special_hmm_state = den_graph_.SpecialHmmState(); + num_sequences = num_sequences_; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -247,10 +313,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, - num_sequences, special_hmm_state, - probs.Data(), probs.Stride(), this_alpha, next_beta, - this_beta, log_prob_deriv.Data(), - log_prob_deriv.Stride()); + num_sequences, probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -262,12 +327,12 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); for (int32 h = 0; h < num_hmm_states; h++) { for (int32 s = 0; s < num_sequences; s++) { - BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha_dash[num_hmm_states * num_sequences + s]; double tot_variable_factor = 0.0; - BaseFloat - occupation_factor = this_alpha_prob / inv_arbitrary_scale; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, *trans_end = transitions + forward_transitions[h].second; @@ -282,13 +347,49 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) { BaseFloat occupation_prob = variable_factor * occupation_factor; log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; } - this_beta[h * num_sequences + s] = + this_beta_dash[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; } } } } +void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix this_log_prob_deriv( + nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " 
alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. + if (!ApproxEqual(this_log_prob_deriv_sum, + num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } +} + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index f3b0afa6721..b0f616673d6 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -41,6 +41,153 @@ namespace kaldi { namespace chain { +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the logprob with respect to their corresponding + alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the log-likelihood + w.r.t. 
the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. + + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. 
It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + // This does forward-backward in parallel on a number of sequences, using a // single HMM. class DenominatorComputation { @@ -70,7 +217,8 @@ class DenominatorComputation { // this adds deriv_weight times (the derivative of the log-prob w.r.t. the // nnet output), to 'nnet_output_deriv'. - void Backward(BaseFloat deriv_weight, + // returns true if everything seemed OK, false if a failure was detected. + bool Backward(BaseFloat deriv_weight, CuMatrixBase *nnet_output_deriv); private: @@ -84,6 +232,9 @@ class DenominatorComputation { void AlphaFirstFrame(); // the alpha computation for some 0 < t <= num_time_steps_. void AlphaGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaDash(int32 t); // done after all the alphas, this function computes and returns the total // log-likelihood summed over all the sequences, and sets tot_prob_ (if we're @@ -92,9 +243,15 @@ class DenominatorComputation { // from the Forward() computation). BaseFloat ComputeTotLogLike(); - void BetaLastFrame(); + void BetaDashLastFrame(); // beta computation for 0 <= beta < num_time_steps_. - void BetaGeneralFrame(int32 t); + void BetaDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void Beta(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaGeneralFrameDebug(int32 t); const ChainTrainingOptions &opts_; const DenominatorGraph &den_graph_; @@ -116,13 +273,18 @@ class DenominatorComputation { // the derivs w.r.t. the nnet outputs (transposed) CuMatrix nnet_output_deriv_transposed_; - // the alpha probabilities; dimension is (frames_per_sequence + 1) by (num-hmm-states - // * num-sequences). Note, they are not logs. + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. CuMatrix alpha_; - // the beta probabilities (rolling buffer); dimension is 2 * (num-hmm-states * - // num-sequences). Note: for efficiency and to simplify the equations, these - // are actually the beta / tot_prob_. 
+ // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. CuMatrix beta_; // the total probability for each sequence, excluding the product of @@ -136,11 +298,13 @@ class DenominatorComputation { CuVector tot_log_prob_; // the log of the total correction term for each sequence, which is the - // product of the alpha_[special hmm state] over all the frames. The - // 'correction terms' are terms that we divide the alphas and betas by in - // order to keep them in a good dynamic range. The product of them - // must be included in the total likelihood. + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. The product of + // them must be included in the total likelihood. CuVector log_correction_term_; + + bool ok_; }; diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h index af7a1a6b176..8ec1dcf322c 100644 --- a/src/chain/chain-kernels-ansi.h +++ b/src/chain/chain-kernels-ansi.h @@ -29,7 +29,6 @@ extern "C" { const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, @@ -42,7 +41,6 @@ extern "C" { const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 8fcf8037d36..ea10b6680f0 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -40,9 +40,9 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { // threshold itself with probability (value / threshold). This preserves // expectations. Note: we assume that value >= 0. - // you can choose any value for the threshold, but powers of 2 are nice - // because they will exactly preserve the precision of the value. - const Real threshold = 1.0 / (1 << 14); + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); if (value >= threshold) { atomic_add(address, value); } else { @@ -67,7 +67,6 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) { if ((x >> 12) > (x & 4095)) atomic_add(address, threshold); } - } // one iteration of the forward computation in the 'tombstone' CTC HMM computation. 
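The atomic_add_thresholded() device function above commits small derivative values only probabilistically: anything below threshold = 1.0 / (1 << kThresholdingPowerOfTwo) is either dropped or rounded up to the full threshold, with the probability chosen so that the expected amount added is unchanged. The host-side sketch below shows just that expectation-preserving idea; it uses an explicit RNG where the kernel uses a cheap deterministic bit-mixing test, so it is an illustration rather than the kernel's actual logic.

  #include <random>

  // Adds 'value' (assumed >= 0) into *acc, quantizing values below 'threshold':
  // add the full threshold with probability value / threshold, else add nothing.
  // Either way E[amount added] == value, so the accumulated derivatives are
  // preserved in expectation.
  void AddThresholded(double value, double threshold,
                      std::mt19937 *rng, double *acc) {
    if (value >= threshold) {
      *acc += value;
    } else {
      std::uniform_real_distribution<double> u(0.0, 1.0);
      if (u(*rng) < value / threshold)
        *acc += threshold;
    }
  }

In the kernel the threshold is 2^-14 (kThresholdingPowerOfTwo = 14), small enough that the added randomness is negligible while saving many atomic adds.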
@@ -82,7 +81,6 @@ __global__ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, @@ -137,15 +135,18 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; } - // Let arbitrary_scale be the inverse of the alpha value for the - // hmm-state indexed special_hmm_state_ on the previous frame (for this - // sequence); we multiply this into all the transition-probabilities - // from the previous frame to this frame, in both the forward and - // backward passes, in order to keep the alphas in a good numeric range. - // This won't affect the posteriors, but when computing the total - // likelihood we'll need to compensate for it later on. + int32_cuda num_hmm_states = gridDim.y; + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. BaseFloat arbitrary_scale = - 1.0 / prev_alpha[special_hmm_state * num_sequences + s]; + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; } @@ -154,7 +155,6 @@ __global__ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, @@ -179,10 +179,14 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, if (s >= num_sequences) return; + // below, you can read 'gridDim.y' as 'num_hmm_states'. See where + // arbitrary_scale is defined in the forward computation above, for more + // explanation. 
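The forward-kernel hunk above replaces the old special_hmm_state normalizer with the per-sequence alpha-sum stored one past the last real state. A compact sketch of the resulting column layout (these helpers are illustrative, not part of the patch):

// Each row of the alpha matrix covers one frame and has
// num_hmm_states * num_sequences + num_sequences entries.

// Column holding alpha(h, s): state h of sequence s.
inline int AlphaColumn(int h, int s, int num_sequences) {
  return h * num_sequences + s;
}

// Column holding the alpha-sum for sequence s: where state 'num_hmm_states'
// would go, i.e. one past the end of the real states.
inline int AlphaSumColumn(int s, int num_hmm_states, int num_sequences) {
  return num_hmm_states * num_sequences + s;
}

// The forward kernel then uses
//   arbitrary_scale = 1.0 / prev_alpha[AlphaSumColumn(s, num_hmm_states, num_sequences)]
// to keep the alphas near 1; the product of these per-frame sums is what
// log_correction_term_ accumulates, as noted in the comment above.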
BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], inv_arbitrary_scale = - this_alpha[special_hmm_state * num_sequences + s]; + this_alpha[gridDim.y * num_sequences + s]; double tot_variable_factor = 0.0; + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; const DenominatorGraphTransition *trans_iter = transitions + forward_transitions[h].first, @@ -223,7 +227,8 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), occupation_prob0); } - this_beta[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale; + BaseFloat beta = tot_variable_factor / inv_arbitrary_scale; + this_beta[h * num_sequences + s] = beta; } @@ -231,28 +236,26 @@ void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, const Int32Pair *backward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *prev_alpha, BaseFloat *this_alpha) { _cuda_chain_hmm_forward<<>>(backward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, prev_alpha, this_alpha); + num_sequences, probs, prob_stride, + prev_alpha, this_alpha); } void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, const Int32Pair *forward_transitions, const DenominatorGraphTransition *transitions, int32_cuda num_sequences, - int32_cuda special_hmm_state, const BaseFloat *probs, int32_cuda prob_stride, const BaseFloat *this_alpha, const BaseFloat *next_beta, BaseFloat *this_beta, BaseFloat *log_prob_deriv, int32_cuda log_prob_deriv_stride) { _cuda_chain_hmm_backward<<>>(forward_transitions, transitions, - num_sequences, special_hmm_state, - probs, prob_stride, this_alpha, next_beta, + num_sequences, probs, prob_stride, + this_alpha, next_beta, this_beta, log_prob_deriv, log_prob_deriv_stride); } diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 1dc9d9d489d..15cb31e0571 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -76,8 +76,8 @@ class NumeratorComputation { BaseFloat Forward(); // Does the backward computation and (efficiently) adds the derivative of the - // nnet output w.r.t. the (log-prob times supervision_.weight) to - // 'nnet_output_deriv'. + // nnet output w.r.t. the (log-prob times supervision_.weight times + // deriv_weight) to 'nnet_output_deriv'. 
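In scalar terms, the backward kernel above factors the per-transition posterior so that the part depending only on the source state (occupation_factor) is hoisted out of the inner loop. The sketch below restates one transition's contribution using the notation of the long comment earlier; it is a derivation from that comment, not the kernel code itself, and the variable names are only meant to mirror it roughly.

// Contribution of one transition i -> j with pdf n, on frame t of sequence s.
float OccupationProbSketch(float alpha_dash_t_i,   // alpha'(t, i)
                           float beta_t1_j,        // beta(t+1, j)
                           float p,                // transition probability
                           float x_t_n,            // exp(nnet output) for pdf n
                           float alpha_sum_t) {    // 1 / arbitrary_scale on frame t
  float variable_factor = p * x_t_n * beta_t1_j;          // summed into tot_variable_factor
  float occupation_factor = alpha_dash_t_i / alpha_sum_t; // hoisted out of the loop
  return variable_factor * occupation_factor;             // added to log_prob_deriv(n, s)
}

Summing variable_factor over the transitions leaving state i and dividing by alpha_sum_t gives beta'(t, i), which is what this_beta stores; this is the same gamma / beta' update as in the CPU sketch above, reorganized to minimize per-transition work.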
void Backward(CuMatrixBase *nnet_output_deriv); private: diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index e6a333317e8..ea673df3291 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -251,15 +251,17 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, nnet_output.SetRandn(); ChainTrainingOptions opts; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; CuMatrix nnet_output_deriv(nnet_output.NumRows(), nnet_output.NumCols(), kUndefined); - BaseFloat objf, weight; + BaseFloat objf, l2_term, weight; ComputeChainObjfAndDeriv(opts, den_graph, supervision, - nnet_output, &objf, &weight, + nnet_output, &objf, &l2_term, &weight, &nnet_output_deriv); { @@ -296,11 +298,12 @@ void ChainTrainingTest(const DenominatorGraph &den_graph, CuMatrix nnet_output_perturbed(nnet_delta_output); nnet_output_perturbed.AddMat(1.0, nnet_output); - BaseFloat objf_modified, weight_modified; + BaseFloat objf_modified, l2_term_modified, weight_modified; ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output_perturbed, - &objf_modified, &weight_modified, + &objf_modified, &l2_term_modified, + &weight_modified, NULL); observed_objf_changes(p) = objf_modified - objf; @@ -419,21 +422,6 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) { 10.0); } - { // another check: that scaling the initial probs has the expected effect. - BaseFloat scale = 0.1 + 0.7 * RandUniform(); - DenominatorGraph den_graph_scaled(den_graph); - den_graph_scaled.ScaleInitialProbs(scale); - DenominatorComputation denominator_computation_scaled_initial( - opts, den_graph_scaled, - num_sequences, nnet_output); - BaseFloat forward_prob_scaled_initial = - denominator_computation_scaled_initial.Forward(); - BaseFloat observed_difference = - forward_prob_scaled_initial - forward_prob, - predicted_difference = num_sequences * log(scale); - AssertEqual(observed_difference, predicted_difference); - } - int32 num_tries = 5; BaseFloat epsilon = 1.0e-04; Vector predicted_objf_changes(num_tries), diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 03fdb3cbe64..7d699600bee 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -800,5 +800,27 @@ void GetWeightsForRanges(int32 range_length, } } + +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights) { + KALDI_ASSERT(range_length > 0 && num_frames_zeroed * 2 < range_length); + int32 num_ranges = range_starts.size(); + weights->resize(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + (*weights)[i].Resize(range_length); + (*weights)[i].Set(1.0); + } + if (num_frames_zeroed == 0) + return; + for (int32 i = 1; i < num_ranges; i++) + (*weights)[i].Range(0, num_frames_zeroed).Set(0.0); + for (int32 i = 0; i + 1 < num_ranges; i++) + (*weights)[i].Range(range_length - num_frames_zeroed, + num_frames_zeroed).Set(0.0); +} + + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index b17f62d00ad..2dda8baf1e4 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -355,7 +355,7 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, /// all the same it will only append Supervision objects where successive ones /// have the same weight and num-frames, and if 'compactify' is true. 
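The perturbation test in ChainTrainingTest above is a standard finite-difference check: the predicted objective change under a small random perturbation is the inner product of the computed derivative with that perturbation, and it should agree with the observed change in the objective. A generic plain-vector version of the predicted quantity (illustrative; the real test works on CuMatrix objects):

#include <vector>

float PredictedObjfChange(const std::vector<float> &nnet_output_deriv,
                          const std::vector<float> &delta) {
  float sum = 0.0f;
  for (size_t i = 0; i < nnet_output_deriv.size() && i < delta.size(); i++)
    sum += nnet_output_deriv[i] * delta[i];
  return sum;   // compare against objf(output + delta) - objf(output)
}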
The /// normal use-case for this is when you are combining neural-net examples for -/// training; appending them like this helps to simplify the decoding process. +/// training; appending them like this helps to simplify the training process. /// This function will crash if the values of label_dim in the inputs are not /// all the same. @@ -402,6 +402,28 @@ void GetWeightsForRanges(int32 range_length, std::vector > *weights); +/// This is a newer version of GetWeightsForRanges with a simpler behavior +/// than GetWeightsForRanges and a different purpose. Instead of aiming to +/// create weights that sum to one over the whole file, the purpose is to +/// zero out the derivative weights for a certain number of frames to each +/// side of every 'cut point' in the numerator lattice [by numerator lattice, +/// what I mean is the FST that we automatically generate from the numerator +/// alignment or lattice]. So we don't zero out the weights for the very +/// beginning or very end of each original utterance, just those where +/// we split the utterance into pieces. We believe there is an incentive +/// for the network to produce deletions near the edges, and this aims to fix +/// this problem. +/// range_length is the length of each range of times (so range_starts[0] +/// represents the start of a range of t values of length 'range_length' +/// and so range_starts[1] etc.), and num_frames_zeroed is the number of frames +/// on each side of the cut point on which we are supposed to zero out the +/// derivative. +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights); + + typedef TableWriter > SupervisionWriter; typedef SequentialTableReader > SequentialSupervisionReader; typedef RandomAccessTableReader > RandomAccessSupervisionReader; diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 42cdfed2713..1bf0201fbfa 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -29,9 +29,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *tot_objf, - BaseFloat *tot_weight, - CuMatrixBase *nnet_output_deriv) { + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { BaseFloat num_logprob_weighted; if (nnet_output_deriv) nnet_output_deriv->SetZero(); @@ -40,29 +42,44 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // note: supervision.weight is included as a factor in the derivative from // the numerator object, and the logprob too. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) + if (nnet_output_deriv) { numerator.Backward(nnet_output_deriv); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(*nnet_output_deriv); + } else if (xent_output_deriv) { + // this branch will be taken if xent_output_deriv but not + // nnet_output_deriv is set- which could happen if you want to compute the + // cross-entropy objective but not the derivatives. 
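To make the behavior of GetWeightsForRangesNew() concrete, here is a plain-vector restatement (the real function fills Vector<BaseFloat> objects). For range_length = 6, num_frames_zeroed = 1 and three ranges the weights come out as 1 1 1 1 1 0 / 0 1 1 1 1 0 / 0 1 1 1 1 1, i.e. derivatives are zeroed only next to the internal cut points, never at the true utterance edges.

#include <vector>

void WeightsForRangesNewSketch(int range_length, int num_frames_zeroed,
                               int num_ranges,
                               std::vector<std::vector<float> > *weights) {
  weights->assign(num_ranges, std::vector<float>(range_length, 1.0f));
  if (num_frames_zeroed == 0) return;
  for (int i = 1; i < num_ranges; i++)        // zero the start, except the first range
    for (int k = 0; k < num_frames_zeroed; k++)
      (*weights)[i][k] = 0.0f;
  for (int i = 0; i + 1 < num_ranges; i++)    // zero the end, except the last range
    for (int k = 0; k < num_frames_zeroed; k++)
      (*weights)[i][range_length - num_frames_zeroed + k] = 0.0f;
}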
+ xent_output_deriv->SetZero(); + numerator.Backward(xent_output_deriv); + } } DenominatorComputation denominator(opts, den_graph, supervision.num_sequences, nnet_output); BaseFloat den_logprob = denominator.Forward(); + bool ok = true; if (nnet_output_deriv) - denominator.Backward(-supervision.weight, - nnet_output_deriv); + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); - *tot_objf = num_logprob_weighted - supervision.weight * den_logprob; - *tot_weight = supervision.weight * supervision.num_sequences * + *objf = num_logprob_weighted - supervision.weight * den_logprob; + *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; - if (!(*tot_objf == *tot_objf)) { - // inf or NaN detected + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. if (nnet_output_deriv) nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); BaseFloat default_objf = -10; - KALDI_WARN << "Objective function is " << (*tot_objf) - << ", setting to " << default_objf << " per frame."; - *tot_objf = default_objf * *tot_weight; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; } // This code helps us see how big the derivatives are, on average, @@ -81,6 +98,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, row_products_per_frame(i / num_sequences) += row_products_cpu(i); KALDI_LOG << "Derivs per frame are " << row_products_per_frame; } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 8eb7e8343f4..e6143d10846 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -40,11 +40,44 @@ namespace chain { struct ChainTrainingOptions { - // Currently empty. - - ChainTrainingOptions() { } - + // l2 regularization constant on the 'chain' output; the actual term added to + // the objf will be -0.5 times this constant times the squared l2 norm. + // (squared so it's additive across the dimensions). e.g. try 0.0005. + BaseFloat l2_regularize; + + // Coefficient for 'leaky hmm'. This means we have an epsilon-transition from + // each state to a special state with probability one, and then another + // epsilon-transition from that special state to each state, with probability + // leaky_hmm_coefficient times [initial-prob of destination state]. Imagine + // we make two copies of each state prior to doing this, version A and version + // B, with transition from A to B, so we don't have to consider epsilon loops- + // or just imagine the coefficient is small enough that we can ignore the + // epsilon loops. + BaseFloat leaky_hmm_coefficient; + + + // Cross-entropy regularization constant. (e.g. try 0.1). If nonzero, + // the network is expected to have an output named 'output-xent', which + // should have a softmax as its final nonlinearity. 
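The l2 term added in chain-training.cc above is simply a scaled sum of squares of the nnet output, with the matching derivative added to nnet_output_deriv (the real code uses TraceMatMat(nnet_output, nnet_output, kTrans) for the sum of squares and AddMat for the derivative). A scalar sketch of the same arithmetic:

#include <vector>
#include <cstddef>

// l2_term        = -0.5 * (supervision_weight * l2_regularize) * sum_i y_i^2
// d(l2_term)/dy_i = -(supervision_weight * l2_regularize) * y_i
float L2TermSketch(const std::vector<float> &nnet_output,
                   float supervision_weight, float l2_regularize,
                   std::vector<float> *nnet_output_deriv) {
  float scale = supervision_weight * l2_regularize, sumsq = 0.0f;
  for (std::size_t i = 0; i < nnet_output.size(); i++) {
    sumsq += nnet_output[i] * nnet_output[i];
    if (nnet_output_deriv != NULL)
      (*nnet_output_deriv)[i] += -scale * nnet_output[i];
  }
  return -0.5f * scale * sumsq;
}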
+ BaseFloat xent_regularize; + + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), + xent_regularize(0.0) { } + void Register(OptionsItf *opts) { + opts->Register("l2-regularize", &l2_regularize, "l2 regularization " + "constant for 'chain' training, applied to the output " + "of the neural net."); + opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " + "that allows transitions from each HMM state to each other " + "HMM state, to ensure gradual forgetting of context (can " + "improve generalization). For numerical reasons, may not be " + "exactly zero."); + opts->Register("xent-regularize", &xent_regularize, "Cross-entropy " + "regularization constant for 'chain' training. If " + "nonzero, the network is expected to have an output " + "named 'output-xent', which should have a softmax as " + "its final nonlinearity."); } }; @@ -59,10 +92,13 @@ struct ChainTrainingOptions { paths and constraints on the alignment as an FST @param [in] nnet_output The output of the neural net; dimension must equal ((supervision.num_sequences * supervision.frames_per_sequence) by - den_graph.NumPdfs()). + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. @param [out] objf The [num - den] objective function computed for this example; you'll want to divide it by 'tot_weight' before displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o @param [out] weight The weight to normalize the objective function by; equals supervision.weight * supervision.num_sequences * supervision.frames_per_sequence. @@ -70,14 +106,22 @@ struct ChainTrainingOptions { the neural-net output. Only written to if non-NULL. You don't have to zero this before passing to this function, we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. 
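The row ordering documented above ("all sequences for frame 0; all sequences for frame 1; ...") means the output for frame t of sequence s lives at the row given by the helper below (an illustrative helper, not part of the API); this is the same layout the denominator kernels assume when they index column s within each frame block.

inline int NnetOutputRow(int t, int s, int num_sequences) {
  return t * num_sequences + s;   // frame-major, sequence-minor
}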
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, const CuMatrixBase &nnet_output, - BaseFloat *tot_objf, - BaseFloat *tot_weight, - CuMatrixBase *nnet_output_deriv); + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); + } // namespace chain diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index 3bdf710c489..3f092879b6e 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -40,9 +40,11 @@ class NnetChainLdaStatsAccumulator { void AccStats(const NnetChainExample &eg) { ComputationRequest request; - bool need_backprop = false, store_stats = false; + bool need_backprop = false, store_stats = false, + need_xent = false, need_xent_deriv = false; - GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, &request); + GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, + need_xent, need_xent_deriv, &request); const NnetComputation &computation = *(compiler_.Compile(request)); diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 4e32d280638..ed162d1d18b 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -25,6 +25,7 @@ #include "hmm/posterior.h" #include "nnet3/nnet-example.h" #include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-example-utils.h" namespace kaldi { namespace nnet3 { @@ -48,6 +49,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 frames_per_eg, int32 frames_overlap_per_eg, int32 frame_subsampling_factor, + int32 cut_zero_frames, int64 *num_frames_written, int64 *num_egs_written, NnetChainExampleWriter *example_writer) { @@ -57,13 +59,36 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, num_feature_frames_subsampled = (num_feature_frames + frame_subsampling_factor - 1)/ frame_subsampling_factor; - if (num_output_frames != num_feature_frames_subsampled) - KALDI_ERR << "Mismatch in num-frames: chain supervision has " - << num_output_frames - << " versus features/frame_subsampling_factor = " - << num_feature_frames << " / " << frame_subsampling_factor - << ": check that --frame-subsampling-factor option is set " - << "the same as to chain-get-supervision."; + if (num_output_frames != num_feature_frames_subsampled) { + // we tolerate deviations in the num-frames if they are very small (1 output + // frame). + + if (abs(num_output_frames - num_feature_frames_subsampled) > 1) { + KALDI_ERR << "Mismatch in num-frames: chain supervision has " + << num_output_frames + << " versus features/frame_subsampling_factor = " + << num_feature_frames << " / " << frame_subsampling_factor + << " = " << num_feature_frames_subsampled + << ": check that --frame-subsampling-factor option is set " + << "the same as to chain-get-supervision."; + } + int32 new_num_feature_frames = + num_output_frames * frame_subsampling_factor; + // add a few frames at the end to make it match up. 
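The rest of this hunk, just below, implements this length fix on Kaldi Matrix objects; as a plain-vector sketch of the same idea: when the supervision and the subsampled features disagree by at most one output frame, copy the features into a matrix of the corrected length, repeating the final feature frame if the features were too short (and truncating if they were too long). Names here are illustrative only.

#include <vector>

std::vector<std::vector<float> > PadFeaturesSketch(
    const std::vector<std::vector<float> > &feats, int new_num_frames) {
  std::vector<std::vector<float> > feats_new;
  feats_new.reserve(new_num_frames);
  int num_frames = static_cast<int>(feats.size());
  for (int i = 0; i < new_num_frames; i++) {
    int src = (i < num_frames ? i : num_frames - 1);  // repeat last frame if short
    feats_new.push_back(feats[src]);
  }
  return feats_new;
}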
+ Matrix feats_new(new_num_feature_frames, feats.NumCols(), + kUndefined); + int32 min_feature_frames = std::min(num_feature_frames, + new_num_feature_frames); + feats_new.RowRange(0, min_feature_frames).CopyFromMat( + feats.RowRange(0, min_feature_frames)); + for (int32 i = num_feature_frames; i < new_num_feature_frames; i++) + feats_new.Row(i).CopyFromVec(feats.Row(num_feature_frames - 1)); + return ProcessFile(normalization_fst, feats_new, ivector_feats, + supervision, utt_id, compress, left_context, right_context, + frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, + cut_zero_frames, num_frames_written, num_egs_written, + example_writer); + } KALDI_ASSERT(frames_per_eg % frame_subsampling_factor == 0); @@ -88,9 +113,15 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, // to the edge are not as accurate as they could be, because when we split we // don't know the correct alphas and betas). std::vector > deriv_weights; - chain::GetWeightsForRanges(frames_per_eg_subsampled, - range_starts_subsampled, - &deriv_weights); + if (cut_zero_frames >= 0) + chain::GetWeightsForRangesNew(frames_per_eg_subsampled, + cut_zero_frames / frame_subsampling_factor, + range_starts_subsampled, + &deriv_weights); + else + chain::GetWeightsForRanges(frames_per_eg_subsampled, + range_starts_subsampled, + &deriv_weights); if (range_starts_subsampled.empty()) { KALDI_WARN << "No output for utterance " << utt_id @@ -177,35 +208,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, return true; } -void RoundUpNumFrames(int32 frame_subsampling_factor, - int32 *num_frames, - int32 *num_frames_overlap) { - if (*num_frames % frame_subsampling_factor != 0) { - int32 new_num_frames = frame_subsampling_factor * - (*num_frames / frame_subsampling_factor + 1); - KALDI_LOG << "Rounding up --num-frames=" << (*num_frames) - << " to a multiple of --frame-subsampling-factor=" - << frame_subsampling_factor - << ", now --num-frames=" << new_num_frames; - *num_frames = new_num_frames; - } - if (*num_frames_overlap % frame_subsampling_factor != 0) { - int32 new_num_frames_overlap = frame_subsampling_factor * - (*num_frames_overlap / frame_subsampling_factor + 1); - KALDI_LOG << "Rounding up --num-frames-overlap=" << (*num_frames_overlap) - << " to a multiple of --frame-subsampling-factor=" - << frame_subsampling_factor - << ", now --num-frames-overlap=" << new_num_frames_overlap; - *num_frames_overlap = new_num_frames_overlap; - } - if (*num_frames_overlap < 0 || *num_frames_overlap >= *num_frames) { - KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap) << " < " - << "--num-frames=" << (*num_frames); - } - -} - - } // namespace nnet2 } // namespace kaldi @@ -237,6 +239,7 @@ int main(int argc, char *argv[]) { bool compress = true; int32 left_context = 0, right_context = 0, num_frames = 1, num_frames_overlap = 0, length_tolerance = 100, + cut_zero_frames = -1, frame_subsampling_factor = 1; std::string ivector_rspecifier; @@ -244,6 +247,10 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs in " "compressed format (recommended)"); + po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames " + "(measured before subsampling) to zero the derivative on each " + "side of a cut point (if set, activates new-style derivative " + "weights)"); po.Register("left-context", &left_context, "Number of frames of left " "context the neural net requires."); po.Register("right-context", &right_context, "Number of 
frames of right " @@ -333,14 +340,15 @@ int main(int argc, char *argv[]) { || ivector_feats->NumRows() == 0)) { KALDI_WARN << "Length difference between feats " << feats.NumRows() << " and iVectors " << ivector_feats->NumRows() - << "exceeds tolerance " << length_tolerance; + << " exceeds tolerance " << length_tolerance; num_err++; continue; } if (ProcessFile(normalization_fst, feats, ivector_feats, supervision, - key, compress, left_context, right_context, num_frames, + key, compress, + left_context, right_context, num_frames, num_frames_overlap, frame_subsampling_factor, - &num_frames_written, &num_egs_written, + cut_zero_frames, &num_frames_written, &num_egs_written, &example_writer)) num_done++; else diff --git a/src/chainbin/nnet3-chain-train.cc b/src/chainbin/nnet3-chain-train.cc index 71092f1bc27..5486a5f7fe9 100644 --- a/src/chainbin/nnet3-chain-train.cc +++ b/src/chainbin/nnet3-chain-train.cc @@ -70,17 +70,21 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(nnet_rxfilename, &nnet); - fst::StdVectorFst den_fst; - ReadFstKaldi(den_fst_rxfilename, &den_fst); + bool ok; - NnetChainTrainer trainer(opts, den_fst, &nnet); + { + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); - SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainTrainer trainer(opts, den_fst, &nnet); - for (; !example_reader.Done(); example_reader.Next()) - trainer.Train(example_reader.Value()); + SequentialNnetChainExampleReader example_reader(examples_rspecifier); - bool ok = trainer.PrintTotalStats(); + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); diff --git a/src/configure b/src/configure index c90e9ba4ee0..0f6577dde17 100755 --- a/src/configure +++ b/src/configure @@ -52,7 +52,7 @@ function is_set { ## First do some checks. These verify that all the things are ## here that should be here. -if [ "`basename $PWD`" != "src" ]; then +if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' exit 1 fi @@ -177,7 +177,10 @@ do esac done - +# the idea here is that if you change the configuration options from using +# CUDA to not using it, or vice versa, we want to recompile all parts of the +# code that may use a GPU. Touching this file is a way to force this. +touch cudamatrix/cu-common.h 2>/dev/null function failure { echo "***configure failed: $* ***" >&2 @@ -400,11 +403,11 @@ function linux_configure_mkl_threading { } ## -##CUDA is used in src/cudamatrix and src/nnet{,bin} only. -##It is used to accelerate the neural network training, -##the rest of kaldi is running on CPUs. +## CUDA is used only in selected directories including src/cudamatrix, src/nnet* +## and src/chain*. It is used to accelerate the neural network training, the +## rest of kaldi runs on CPUs. ## -function linux_configure_cuda { +function configure_cuda { #check for CUDA toolkit in the system if [ ! 
$CUDATKDIR ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do @@ -425,9 +428,13 @@ function linux_configure_cuda { echo CUDATKDIR = $CUDATKDIR >> kaldi.mk if [ "`uname -m`" == "x86_64" ]; then - cat makefiles/linux_x86_64_cuda.mk >> kaldi.mk + if [ "`uname`" == "Darwin" ]; then + sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk + else + cat makefiles/cuda_64bit.mk >> kaldi.mk + fi else - cat makefiles/linux_cuda.mk >> kaldi.mk + cat makefiles/cuda_32bit.mk >> kaldi.mk fi else echo "CUDA will not be used! If you have already installed cuda drivers " @@ -538,7 +545,7 @@ function linux_configure_debian_ubuntu { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -557,7 +564,7 @@ function linux_configure_debian_ubuntu3 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -579,7 +586,7 @@ function linux_configure_debian7 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -598,7 +605,7 @@ function linux_configure_redhat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -619,7 +626,7 @@ function linux_configure_redhat_fat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -671,7 +678,7 @@ function linux_configure_static { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -750,7 +757,7 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -793,7 +800,7 @@ echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo "FSTROOT = $FSTROOT" >> kaldi.mk # Check installed OpenFst version and add C++11 flags if OpenFst >= 1.4 -OPENFST_VER=`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'` +OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk OPENFST_VER_NUM=`echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d"` if [ $OPENFST_VER_NUM -ge 10400 ]; then @@ -810,7 +817,7 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. 
Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." @@ -970,7 +977,7 @@ if [ "`uname`" == "Linux" ]; then fix_cxx_flag echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with MKL libs from $MKLROOT" exit_success; @@ -993,7 +1000,7 @@ if [ "`uname`" == "Linux" ]; then cat makefiles/linux_clapack.mk >> kaldi.mk fix_cxx_flag echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" exit_success; @@ -1017,7 +1024,7 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk cat makefiles/linux_openblas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." exit_success; diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index 8718c49eea5..2b23bf0b621 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -51,19 +51,20 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, dim3 *dimBlock) { KALDI_ASSERT(num_rows > 0 && num_cols > 0); int32 col_blocksize = 64, row_blocksize = 4; - while (num_cols + (num_cols / 2) <= col_blocksize && - num_rows > 65536 * row_blocksize) { + while (col_blocksize > 1 && + (num_cols + (num_cols / 2) <= col_blocksize || + num_rows > 65536 * row_blocksize)) { col_blocksize /= 2; row_blocksize *= 2; } - KALDI_ASSERT(col_blocksize > 0 && "Matrix too large to process"); - dimBlock->x = col_blocksize; dimBlock->y = row_blocksize; dimBlock->z = 1; dimGrid->x = n_blocks(num_cols, col_blocksize); dimGrid->y = n_blocks(num_rows, row_blocksize); + KALDI_ASSERT(dimGrid->y <= 65536 && + "Matrix has too many rows to process"); dimGrid->z = 1; } #endif diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index ec7e69edad0..c34994ed6ce 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -435,7 +435,7 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // WARNING! the CUDA API is inconsistent accross versions! 
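The cu-common.cc hunk above changes the block-size heuristic so that it also halves the column block (and doubles the row block) whenever the row grid would otherwise exceed the 65536-block limit, instead of asserting. A standalone restatement of that heuristic (illustrative; the real function also fills in dimGrid/dimBlock and asserts the final row-grid size):

void BlockSizeSketch(int num_rows, int num_cols,
                     int *col_blocksize_out, int *row_blocksize_out) {
  int col_blocksize = 64, row_blocksize = 4;
  while (col_blocksize > 1 &&
         (num_cols + (num_cols / 2) <= col_blocksize ||
          num_rows > 65536 * row_blocksize)) {
    col_blocksize /= 2;     // fewer threads per row...
    row_blocksize *= 2;     // ...more rows per block, keeping the row grid <= 65536
  }
  *col_blocksize_out = col_blocksize;
  *row_blocksize_out = row_blocksize;
}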
#ifdef _MSC_VER size_t mem_free, mem_total; - cuMemGetInfo_v2(handle_, &mem_free, &mem_total); + cuMemGetInfo_v2(&mem_free, &mem_total); #else #if (CUDA_VERSION >= 3020) // define the function signature type @@ -447,9 +447,6 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // we will load cuMemGetInfo_v2 dynamically from libcuda.so // pre-fill ``safe'' values that will not cause problems mem_free = 1; mem_total = 1; -#ifdef _MSC_VER - cuMemGetInfo_v2(handle_, &mem_free, &mem_total); -#else // open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); if (NULL == libcuda) { @@ -473,7 +470,6 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // close the library dlclose(libcuda); } -#endif } #endif // copy the output values outside @@ -574,6 +570,7 @@ CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true), CuDevice::~CuDevice() { if (Enabled()) { cublasDestroy(handle_); + cudaDeviceReset(); } } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 804bea1a217..a52c42cf347 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -2,7 +2,7 @@ // Copyright 2009-2012 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu +// 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen @@ -44,7 +44,7 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr */ /* - * CuMatrix + * CuMatrix */ void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); @@ -58,7 +58,7 @@ void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim d); -void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -108,9 +108,9 @@ void cudaF_vec_min(const float* v, float* value, int dim); void cudaF_vec_max(const float* v, float* value, int dim); void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); -void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, - int N_col_stride, int threads_per_element, float beta); +void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, + int N_col_stride, int threads_per_element, float beta); void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim); void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_copy_col_from_mat_fd(int 
Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim); @@ -141,6 +141,7 @@ void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, i void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size); void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); +void cudaF_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); @@ -161,15 +162,15 @@ void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements); void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data); void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t); -void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim); void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indices); + const Int32Pair *indices); void cudaF_add_row_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indexes); + const Int32Pair *indexes); void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, const Int32Pair *indices, int indices_size, float *output); @@ -177,19 +178,19 @@ void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - + /********************************************************* * double CUDA kernel calls */ /* - * CuMatrix + * CuMatrix */ void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB); void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA); void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *vec, const double *mat2, int mat2_row_stride, - int mat2_col_stride, double beta); + int mat2_col_stride, double beta); void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat); void cudaD_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); @@ -197,7 +198,7 @@ void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool 
include_sign, MatrixDim d); -void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); +void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -248,9 +249,9 @@ void cudaD_vec_min(const double* v, double* value, int dim); void cudaD_vec_max(const double* v, double* value, int dim); void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); -void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, - int N_col_stride, int threads_per_element, double beta); +void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, + int N_col_stride, int threads_per_element, double beta); void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim); void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim); @@ -271,7 +272,7 @@ void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const d void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks, const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride, const double *D_data, int D_row_stride, int D_col_stride, - double alpha, double beta); + double alpha, double beta); /* @@ -283,6 +284,7 @@ void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); void cudaD_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size); void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); +void cudaD_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); @@ -342,14 +344,14 @@ void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim, const Int32Pair *indices, int indices_size, double *output); - + void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, - const double *mat2, double *mask, MatrixDim mat1_dim, + const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - - -} // extern "C" + + +} // extern "C" #endif // HAVE_CUDA diff --git 
a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 00af3eb234a..d494be4169a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -931,15 +931,15 @@ static void _add_diag_mat_mat( int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells // us which block of elements we sum up. - if (v_idx >= v_dim) return; - - Real sum = 0.0; - for (int j = sub_idx; j < M_cols; j += threads_per_element) { - int M_index = v_idx * M_row_stride + j * M_col_stride, - N_index = j * N_row_stride + v_idx * N_col_stride; - sum += M[M_index] * N[N_index]; + if (v_idx < v_dim) { + Real sum = 0.0; + for (int j = sub_idx; j < M_cols; j += threads_per_element) { + int M_index = v_idx * M_row_stride + j * M_col_stride, + N_index = j * N_row_stride + v_idx * N_col_stride; + sum += M[M_index] * N[N_index]; + } + temp_data[threadIdx.x] = sum; } - temp_data[threadIdx.x] = sum; // start_idx = threadIdx.x - sub_idx; // start of the position in temp_data // that we want to sum up. @@ -959,7 +959,7 @@ static void _add_diag_mat_mat( __syncthreads(); num_total_threads = half_point; } - if (sub_idx == 0) { + if (sub_idx == 0 && v_idx < v_dim) { v[v_idx] = beta * v[v_idx] + alpha * temp_data[threadIdx.x]; } } @@ -1152,7 +1152,6 @@ __global__ static void _pvec_sum(Real* v, Real* g, int dim, int size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int start = size * i; - if (start >= dim) return; int end = start + size; if (end > dim) end = dim; __shared__ Real row_data[CU1DBLOCK]; @@ -1752,6 +1751,19 @@ static void _diff_tanh(Real*eout, const Real*e, const Real*y, MatrixDim d, int e eout[dst_index] = (1.0 - y[y_index]*y[y_index]) * e[e_index]; } +template +__global__ +static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j*d.stride, src_index = i + j*src_stride; + if(i < d.cols && j < d.rows) { + Real res = (x[src_index] > 0.0 ? 
1.0 : 0.0); + y[dst_index] = res; + } +} + + template __global__ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { @@ -2145,7 +2157,6 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_heaviside<<>>(mat, d); - } void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -2471,6 +2482,10 @@ void cudaF_diff_tanh (dim3 Gr, dim3 Bl, float* eout, const float* e, const float _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaF_heaviside (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaF_softmax_reduce (size_t Gr, size_t Bl, float* y, const float* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } @@ -2930,6 +2945,10 @@ void cudaD_diff_tanh (dim3 Gr, dim3 Bl, double* eout, const double* e, const dou _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaD_heaviside (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaD_softmax_reduce (size_t Gr, size_t Bl, double* y, const double* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fc1fbae54da..0ded2f794d3 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -4,7 +4,7 @@ // 2013 Ehsan Variani // 2014 Johns Hopkins University (author: Daniel Povey) // 2013 Hainan Xu -// 2013 Xiaohui Zhang +// 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors @@ -33,14 +33,14 @@ #include "cudamatrix/cu-kernels-ansi.h" /* - * In this file are C++ templated wrappers + * In this file are C++ templated wrappers * of the ANSI-C CUDA kernels */ namespace kaldi { /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_upp_low(Gr, Bl, A, dimA); } @@ -176,10 +176,10 @@ inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); } inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim) { cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim, const float *mat2, int mat2_row_stride, int mat2_col_stride, const float *vec, float beta) { cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride, mat2_col_stride, vec, beta); } -inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } +inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } + - /* * CuVector */ @@ -194,8 +194,8 @@ inline void 
cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min( inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); } inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); } inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, +inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, int N_col_stride, int threads_per_element, float beta) { cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -240,6 +240,7 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_heaviside(Gr,Bl,y,x,d,src_stride); } /* Bl: dimBlock value is fixed min(d.col, CU1DBLOCK), represent CU1DBLOCK threads reduce a row at the same time. 
Gr: the number of rows @@ -283,7 +284,7 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } @@ -293,7 +294,7 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const f // double versions /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); } inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); } @@ -378,8 +379,8 @@ inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_mi inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); } inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); } inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, +inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, int N_col_stride, int threads_per_element, double beta) { cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -422,6 +423,7 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_heaviside(Gr,Bl,y,x,d,src_stride); } inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); } inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_log_softmax_reduce(Gr,Bl,y,x,d,src_stride); } @@ -460,7 +462,7 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, 
int mask_stride) { cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 453cf4439fb..65a4c0c4af3 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -1,7 +1,7 @@ // cudamatrix/cu-math.h // Copyright 2009-2012 Karel Vesely -// 2013 Johns Hopkins University (Author: David Snyder) +// 2013 Johns Hopkins University (Author: David Snyder) // See ../../COPYING for clarification regarding multiple authors // @@ -28,9 +28,9 @@ #include "base/timer.h" namespace kaldi { - + namespace cu { - + /// RegularizeL1 is a gradient step with l1 regularization added to the /// gradient. We don't let the value cross over zero from positive to negative /// or vice versa, in a single step. If an element tries to cross zero and is @@ -40,9 +40,9 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *gradient, Real l1_penalty, Real learning_rate); /// Copies a permutation of src into tgt. The row permutation is specified in -/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The +/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The /// dimensions of copy_from_idx must be equivalent to the number of rows in -/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. +/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, @@ -52,10 +52,10 @@ void Randomize(const CuMatrixBase &src, /// The dimensions of tgt must be equivalent to the number of rows in src /// and it must be that tgt.NumColumns == src.NumColumns * frame_offsets.Dim(). /// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the -/// general case where i in [0..src.NumRows()-1], -/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] +/// general case where i in [0..src.NumRows()-1], +/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] /// and n_cols = src.NumColumns(). If i + frame_offsets[k] is greater than the -/// number of rows in src or less than 0 than the right side of the equation +/// number of rows in src or less than 0 than the right side of the equation /// is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid /// an index out of bounds. template @@ -73,6 +73,13 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, CuMatrixBase *tgt); +template +void Group2norm(const CuMatrixBase &src, + CuMatrixBase *dest, + int32 group_stride); + + + } // namespace cu } // namespace kaldi diff --git a/src/cudamatrix/cu-matrix-inl.h b/src/cudamatrix/cu-matrix-inl.h index b4b51cbc53b..aa6fcf6f44d 100644 --- a/src/cudamatrix/cu-matrix-inl.h +++ b/src/cudamatrix/cu-matrix-inl.h @@ -54,8 +54,13 @@ inline CuSubMatrix::CuSubMatrix(const Real *data, // in general if you use SubMatrix or CuSubMatrix, const-correctness is not // preserved (preserving it would require us duplicating the class and it // would have been a hassle). + + // Note: we used to check that stride >= num_cols. We no longer check for + // this as there are some situations where having stride < num_cols is useful, + // but beware because most if not all CUBLAS calls will crash when given + // such an input, even in a situation where it makes sense. 
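The relaxed CuSubMatrix check above permits views whose stride is smaller than num_cols. In plain row-major terms, element (r, c) of such a view lives at data[r * stride + c], so with stride < num_cols successive rows overlap (each row begins before the previous one ends); as the comment notes, most CUBLAS calls will not accept such a view, so it is only safe for kernels that index elements directly. A one-line indexing sketch (illustrative only):

inline float ViewElement(const float *data, int r, int c, int stride) {
  return data[r * stride + c];   // rows overlap when stride < num_cols
}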
KALDI_ASSERT((num_rows != 0) == (num_cols != 0) && stride >= 0 && - num_rows >= 0 && num_cols >= 0 && num_cols <= stride); + num_rows >= 0 && num_cols >= 0 && stride >= 0); } diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index f50ded8c209..1c32de34d5c 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -298,6 +298,23 @@ template void TestCuMatrixSigmoid(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixHeaviside(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + N.ApplyHeaviside(); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::Heaviside" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMulRowsGroupMat(int32 dim) { BaseFloat time_in_secs = 0.025; @@ -806,6 +823,8 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCholesky(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSigmoid(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixHeaviside(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuFindRowMaxId(sizes[s]); for (int32 s = 0; s < ns; s++) diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 2f675faad99..74419ea25ba 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -754,6 +754,25 @@ static void UnitTestCuMatrixApplyHeaviside() { } +template +static void UnitTestCuMatrixHeaviside() { + + for (int32 i = 0; i < 1; i++) { + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); + H.SetRandn(); + H.Row(0).Set(0.0); + if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } + + CuMatrix cH(H); + CuMatrix cH2(H.NumRows(), H.NumCols(), kUndefined); + cH2.Heaviside(cH); + H.ApplyHeaviside(); + Matrix H2(cH2); + AssertEqual(H, H2); + } +} + + template static void UnitTestCuMatrixMulElements() { for (int32 i = 0; i < 2; i++) { @@ -2445,6 +2464,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyFloor(); UnitTestCuMatrixApplyCeiling(); UnitTestCuMatrixApplyHeaviside(); + UnitTestCuMatrixHeaviside(); UnitTestCuMatrixMulElements(); UnitTestCuMatrixDivElements(); UnitTestCuMatrixMax(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index eb5a268d543..7e8780902a6 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -46,7 +46,8 @@ namespace kaldi { template void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type) { + MatrixResizeType resize_type, + MatrixStrideType stride_type) { // This code does not currently support the other resize_type options. 
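  // Illustration of the two stride types handled below (a sketch, using the
  // usual row-major addressing data_[r * stride_ + c]):
  //   - kDefaultStride: rows are allocated via MallocPitch(), so each row may
  //     be padded for alignment and stride_ >= num_cols_ (e.g. a 10 x 100
  //     float matrix might end up with stride_ == 128).
  //   - kStrideEqualNumCols: one contiguous allocation of rows * cols
  //     elements, so stride_ == num_cols_ and the whole matrix can be treated
  //     as a flat array of rows * cols elements.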
KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0); @@ -54,7 +55,6 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, if (resize_type == kSetZero) this->SetZero(); return; } - if (this->num_rows_ != 0) this->Destroy(); if (rows == 0) return; @@ -63,11 +63,19 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, Timer tim; MatrixIndexT row_bytes = cols * sizeof(Real); size_t pitch; - this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( - row_bytes, rows, &pitch)); - this->num_rows_ = rows; - this->num_cols_ = cols; - this->stride_ = pitch / sizeof(Real); + if (stride_type == kDefaultStride) { + this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( + row_bytes, rows, &pitch)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = pitch / sizeof(Real); + } else { // kStrideEqualNumCols + size_t bytes = rows * cols * sizeof(Real); + this->data_ = static_cast(CuDevice::Instantiate().Malloc(bytes)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = cols; + } if (resize_type == kSetZero) this->SetZero(); CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); } else @@ -75,7 +83,7 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, { // Let the initializer of Matrix handle the allocation, // and then just do Swap which will switch the pointers. // This wastes a few instructions but is simple to code. - Matrix mat(rows, cols, resize_type); + Matrix mat(rows, cols, resize_type, stride_type); this->Swap(&mat); } } @@ -1895,6 +1903,7 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), &dimGrid, &dimBlock); cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); + CU_SAFE_CALL(cudaGetLastError()); } else { KALDI_ERR << "Wrong sized arguments"; } @@ -2016,6 +2025,26 @@ void CuMatrixBase::ApplyHeaviside() { } } +template +void CuMatrixBase::Heaviside(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else + #endif + { + Mat().Heaviside(src.Mat()); + } +} template void CuMatrixBase::ApplyExp() { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index fd4c642ab7f..fec26424ef8 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -254,6 +254,11 @@ class CuMatrixBase { /// element by element, x = 1 / (1 + exp(-x)) void Sigmoid(const CuMatrixBase &src); + /// Set each element to the Heaviside function of the corresponding element + /// of "src", which we define as the function (x > 0 ? 1.0 : 0.0) [note: + /// in general, there are different ways to deal with the situation when x==0.] + void Heaviside(const CuMatrixBase &src); + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -336,7 +341,9 @@ class CuMatrixBase { ///< The output will be set zero. If include_sign is true, it will ///< multiply the result by the sign of the input. 
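  // Example usage (a sketch based on the unit test above, not part of the
  // original header): ApplyHeaviside() works in place, while Heaviside()
  // writes into a separate destination and leaves its source untouched:
  //
  //   CuMatrix<BaseFloat> M(100, 100), N(100, 100, kUndefined);
  //   M.SetRandn();
  //   N.Heaviside(M);      // N(i, j) = (M(i, j) > 0 ? 1.0 : 0.0)
  //   M.ApplyHeaviside();  // now M should equal N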
void ApplyPowAbs(Real power, bool include_sign=false); - void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0) + /// For each element, sets x = (x > 0 ? 1.0 : 0.0). + /// See also Heaviside(). + void ApplyHeaviside(); void ApplyFloor(Real floor_val); void ApplyCeiling(Real ceiling_val); void ApplyExp(); @@ -425,9 +432,9 @@ class CuMatrixBase { /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) void AddMatMatElements(const Real alpha, - const CuMatrixBase& A, - const CuMatrixBase& B, - const Real beta); + const CuMatrixBase& A, + const CuMatrixBase& B, + const Real beta); /// this <-- beta*this + alpha*A*B void AddMatSp(const Real alpha, @@ -619,8 +626,9 @@ class CuMatrix: public CuMatrixBase { /// Constructor with memory initialisation CuMatrix(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero) { - Resize(rows, cols, resize_type); + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride) { + Resize(rows, cols, resize_type, stride_type); } // Note: we had to remove the "explicit" keyword due @@ -679,7 +687,8 @@ class CuMatrix: public CuMatrixBase { /// Allocate the memory void Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero); + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride); void Swap(Matrix *mat); void Swap(CuMatrix *mat); @@ -782,8 +791,8 @@ template template Matrix::Matrix(const CuMatrixBase &M, MatrixTransposeType trans) { - if (trans == kNoTrans) Init(M.NumRows(), M.NumCols()); - else Init(M.NumCols(), M.NumRows()); + if (trans == kNoTrans) Init(M.NumRows(), M.NumCols(), kDefaultStride); + else Init(M.NumCols(), M.NumRows(), kDefaultStride); M.CopyToMat(this, trans); } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index a32e136f62e..9b7aa97776a 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -22,7 +22,7 @@ #include #include #include - +#include #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudamatrix/cu-matrix.h" @@ -62,7 +62,7 @@ static void UnitTestCuVectorIO() { } -template +template static void UnitTestCuVectorCopyFromVec() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -80,7 +80,7 @@ static void UnitTestCuVectorCopyFromVec() { } } -template +template static void UnitTestCuSubVector() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -97,7 +97,7 @@ static void UnitTestCuSubVector() { -template +template static void UnitTestCuVectorMulTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -105,7 +105,7 @@ static void UnitTestCuVectorMulTp() { A.SetRandn(); TpMatrix B(dim); B.SetRandn(); - + CuVector C(A); CuTpMatrix D(B); @@ -127,10 +127,10 @@ static void UnitTestCuVectorAddTp() { B.SetRandn(); Vector C(dim); C.SetRandn(); - + CuVector D(A); CuTpMatrix E(B); - CuVector F(C); + CuVector F(C); A.AddTpVec(1.0, B, kNoTrans, C, 1.0); D.AddTpVec(1.0, E, kNoTrans, F, 1.0); @@ -160,7 +160,7 @@ template void CuVectorUnitTestAddVec() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -177,7 +177,7 @@ template void CuVectorUnitTestAddVecCross() { CuVector vec1_orig(vec1); Real alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) 
AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } else { @@ -198,7 +198,7 @@ template void CuVectorUnitTestAddVecExtra() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243, beta = 1.4321; vec1.AddVec(alpha, vec2, beta); - + for (int32 i = 0; i < M; i++) AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -268,6 +268,20 @@ template static void UnitTestCuVectorReplaceValue() { } } +template static void UnitTestCuVectorSum() { + for (int32 i = 0; i < 200; i++) { + int32 start_dim = RandInt(1, 500), end_dim = RandInt(1, 500); + int32 dim = RandInt(10, 12000) + start_dim + end_dim; + Real quiet_nan = nan(""); // this is from . + Vector vec(start_dim + dim + end_dim); + vec.Range(0, start_dim).Set(quiet_nan); + vec.Range(start_dim, dim).Set(1.0); + vec.Range(start_dim + dim, end_dim).Set(quiet_nan); + BaseFloat sum = vec.Range(start_dim, dim).Sum(); + KALDI_ASSERT(ApproxEqual(sum, dim)); + } +} + template void CuVectorUnitTestInvertElements() { // Also tests MulElements(); int32 M = 256 + Rand() % 100; @@ -288,7 +302,7 @@ template void CuVectorUnitTestSum() { CuVector A(dim), ones(dim); A.SetRandn(); ones.Set(1.0); - + AssertEqual(VecVec(A, ones), A.Sum()); } } @@ -320,7 +334,7 @@ template void CuVectorUnitTestCopyFromMat() { } Matrix matrix(cu_matrix), matrix2(M, N); CuMatrix matrix3(M, N); - + CuVector vector(M * N), vector2(M * N); vector.CopyRowsFromMat(cu_matrix); vector2.CopyRowsFromMat(matrix); @@ -328,8 +342,8 @@ template void CuVectorUnitTestCopyFromMat() { matrix3.CopyRowsFromVec(Vector(vector2)); Vector vector3(M * N); vector3.CopyRowsFromMat(cu_matrix); - - + + for(int32 j = 0; j < M*N; j++) { if (Rand() % 500 == 0) { // random small subset (it was slow) KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N)); @@ -412,7 +426,7 @@ template void CuVectorUnitTestNorm() { KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0)); KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0))); } - + template void CuVectorUnitTestMin() { for (int32 p = 0; p < 5; p++) { @@ -496,7 +510,7 @@ template void CuVectorUnitTestApplyFloor() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyFloor(floor); int32 j = vector.ApplyFloor(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -517,7 +531,7 @@ template void CuVectorUnitTestApplyCeiling() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyCeiling(floor); int32 j = vector.ApplyCeiling(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -540,7 +554,7 @@ template void CuVectorUnitTestApplyPow() { BaseFloat pow = -2 + (Rand() % 5); cu_vector.ApplyPow(pow); vector.ApplyPow(pow); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -579,7 +593,7 @@ template void CuVectorUnitTestAddDiagMat2() { cu_mat_orig.SetRandn(); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); CuMatrix cu_mat(cu_mat_orig, trans); - + Vector vector(cu_vector); Matrix mat(cu_mat); @@ -604,12 +618,12 @@ static void CuVectorUnitTestAddDiagMatMat() { MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans); MatrixTransposeType transN = ((iter/2) % 2 == 0 ? 
kNoTrans : kTrans); CuMatrix M(M_orig, transM), N(N_orig, transN); - + v.SetRandn(); CuVector w(v); w.AddDiagMatMat(alpha, M, transM, N, transN, beta); - + { CuVector w2(v); CuMatrix MN(dimM, dimM); @@ -669,7 +683,7 @@ template void CuVectorUnitTestAddSpVec() { CuSpMatrix mat_cu(M); mat_cu.SetRandn(); SpMatrix mat(mat_cu); - + BaseFloat alpha = 0.5 * (Rand() % 5), beta = 0.5 * (Rand() % 5); dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta); dst.AddSpVec(alpha, mat, src, beta); @@ -695,6 +709,7 @@ template void CuVectorUnitTest() { CuVectorUnitTestScale(); CuVectorUnitTestSum(); CuVectorUnitTestInvertElements(); + UnitTestCuVectorSum(); CuVectorUnitTestAddRowSumMat(); CuVectorUnitTestAddColSumMat(); UnitTestCuVectorReplaceValue(); @@ -708,8 +723,8 @@ template void CuVectorUnitTest() { CuVectorUnitTestCopyDiagFromPacked(); CuVectorUnitTestCopyDiagFromMat(); CuVectorUnitTestCopyCross(); - CuVectorUnitTestCopyCross2(); - CuVectorUnitTestNorm(); + CuVectorUnitTestCopyCross2(); + CuVectorUnitTestNorm(); CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); @@ -732,10 +747,10 @@ int main(int argc, char *argv[]) { const char *usage = "Usage: cu-vector-test [options]"; ParseOptions po(usage); - std::string use_gpu = "yes"; + std::string use_gpu = "yes"; po.Register("use-gpu", &use_gpu, "yes|no|optional"); po.Read(argc, argv); - + if (po.NumArgs() != 0) { po.PrintUsage(); exit(1); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 64f41720869..6deb3809d85 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -279,7 +279,6 @@ Real CuVectorBase::Sum() const { CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); return tmp.Sum(); } else { - if (dim_ == 0) return 0.0; CuVector tmp(1, kUndefined); int dimBlock(CU1DBLOCK); int dimGrid = 1; // only 1 block here. we have loops in each thread. diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index 158248cc445..3aeef0bf24a 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -54,7 +54,7 @@ struct LatticeFasterDecoderConfig { // LatticeFasterDecoder class itself, but by the code that calls it, for // example in the function DecodeUtteranceLatticeFaster. fst::DeterminizeLatticePhonePrunedOptions det_opts; - + LatticeFasterDecoderConfig(): beam(16.0), max_active(std::numeric_limits::max()), min_active(200), @@ -99,7 +99,7 @@ class LatticeFasterDecoder { typedef Arc::Label Label; typedef Arc::StateId StateId; typedef Arc::Weight Weight; - + // instantiate this class once for each thing you have to decode. LatticeFasterDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -117,7 +117,7 @@ class LatticeFasterDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -365,8 +365,9 @@ class LatticeFasterDecoder { const fst::Fst &fst_; bool delete_fst_; std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... 
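  // A worked illustration of the cost_offsets_ idea (a sketch, not the exact
  // code path): if the acoustic log-likelihoods on some frame all sit around
  // -500, an offset of roughly +500 can be folded into that frame's costs so
  // that the numbers the decoder manipulates stay near zero; the stored
  // offset is undone again when the lattice is produced, so the final scores
  // are unaffected.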
bool warned_; @@ -409,7 +410,7 @@ class LatticeFasterDecoder { void ClearActiveTokens(); - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); + KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); }; diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index 30adb6df302..b69b5492fb7 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -62,7 +62,7 @@ class LatticeFasterOnlineDecoder { BestPathIterator(void *t, int32 f): tok(t), frame(f) { } bool Done() { return tok == NULL; } }; - + // instantiate this class once for each thing you have to decode. LatticeFasterOnlineDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -80,7 +80,7 @@ class LatticeFasterOnlineDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterOnlineDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -107,12 +107,12 @@ class LatticeFasterOnlineDecoder { bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - + /// This function does a self-test of GetBestPath(). Returns true on /// success; returns false and prints a warning on failure. bool TestGetBestPath(bool use_final_probs = true) const; - - + + /// This function returns an iterator that can be used to trace back /// the best path. If use_final_probs == true and at least one final state /// survived till the end, it will use the final-probs in working out the best @@ -133,7 +133,7 @@ class LatticeFasterOnlineDecoder { /// while leaving its "nextstate" variable unchanged. BestPathIterator TraceBackBestPath( BestPathIterator iter, LatticeArc *arc) const; - + /// Outputs an FST corresponding to the raw, state-level /// tracebacks. Returns true if result is nonempty. /// If "use_final_probs" is true AND we reached the final-state @@ -152,7 +152,7 @@ class LatticeFasterOnlineDecoder { bool use_final_probs, BaseFloat beam) const; - + /// InitDecoding initializes the decoding, and should only be used if you /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to /// call this. You can also call InitDecoding if you have already decoded an @@ -334,7 +334,7 @@ class LatticeFasterOnlineDecoder { /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); - + /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. BaseFloat ProcessEmitting(DecodableInterface *decodable); @@ -343,7 +343,7 @@ class LatticeFasterOnlineDecoder { /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). void ProcessNonemitting(BaseFloat cost_cutoff); - + // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of // them at a time can be indexed by StateId. It is indexed by frame-index @@ -361,9 +361,10 @@ class LatticeFasterOnlineDecoder { // make it class member to avoid internal new/delete. const fst::Fst &fst_; bool delete_fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. 
+ std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; diff --git a/src/decoder/lattice-tracking-decoder.h b/src/decoder/lattice-tracking-decoder.h index 91484b56c60..0737ca3db36 100644 --- a/src/decoder/lattice-tracking-decoder.h +++ b/src/decoder/lattice-tracking-decoder.h @@ -74,7 +74,7 @@ struct LatticeTrackingDecoderConfig { } void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 + KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && prune_interval > 0 && beam_delta > 0.0 && hash_ratio >= 1.0 && extra_beam >= 0.0 && max_beam >= beam); } @@ -135,7 +135,7 @@ class LatticeTrackingDecoder { /// format. bool Decode(DecodableInterface *decodable, const fst::StdVectorFst &arc_graph); - + /// says whether a final-state was active on the last frame. If it was not, the /// lattice (or traceback) will end with states that are not final-states. bool ReachedFinal() const { return final_active_; } @@ -167,7 +167,7 @@ class LatticeTrackingDecoder { /// final-probs as one. bool GetLattice(fst::MutableFst *ofst, bool use_final_probs = true) const; - + private: struct Token; // ForwardLinks are the links from a token to a token on the next frame. @@ -181,13 +181,13 @@ class LatticeTrackingDecoder { ForwardLink *next; // next in singly-linked list of forward links from a // token. inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, + BaseFloat graph_cost, BaseFloat acoustic_cost, ForwardLink *next): next_tok(next_tok), ilabel(ilabel), olabel(olabel), - graph_cost(graph_cost), acoustic_cost(acoustic_cost), + graph_cost(graph_cost), acoustic_cost(acoustic_cost), next(next) { } - }; - + }; + // Token is what's resident in a particular state at a particular time. // In this decoder a Token actually contains *forward* links. // When first created, a Token just has the (total) cost. We add forward @@ -200,19 +200,19 @@ class LatticeTrackingDecoder { // that any of the currently active states at the decoding front may // eventually succeed (e.g. if you were to take the currently active states // one by one and compute this difference, and then take the minimum). - + ForwardLink *links; // Head of singly linked list of ForwardLinks - + Token *next; // Next in list of tokens for this frame. - + StateId lat_state; // current state in graph arc lattice from first pass decoding // lat_state == fst::kNoStateId means that this token is not tracked - + inline Token(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLink *links, Token *next, StateId lat_state): tot_cost(tot_cost), extra_cost(extra_cost), links(links), next(next), lat_state(lat_state) { } inline void DeleteForwardLinks() { - ForwardLink *l = links, *m; + ForwardLink *l = links, *m; while (l != NULL) { m = l->next; delete l; @@ -221,7 +221,7 @@ class LatticeTrackingDecoder { links = NULL; } }; - + // head and tail of per-frame list of Tokens (list is in topological order), // and something saying whether we ever pruned it using PruneForwardLinks. 
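  // For illustration (a sketch, not part of this class): the Token and
  // ForwardLink structures above form two levels of singly linked lists and,
  // assuming the usual per-frame vector of TokenLists (active_toks_ in these
  // decoders), they are walked like this:
  //
  //   for (Token *tok = active_toks_[frame].toks; tok != NULL; tok = tok->next)
  //     for (ForwardLink *link = tok->links; link != NULL; link = link->next) {
  //       BaseFloat cost = tok->tot_cost + link->graph_cost + link->acoustic_cost;
  //       // ... prune or output the link ...
  //     }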
struct TokenList { @@ -231,7 +231,7 @@ class LatticeTrackingDecoder { TokenList(): toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) { } }; - + typedef HashList::Elem Elem; void PossiblyResizeHash(size_t num_toks); @@ -248,7 +248,7 @@ class LatticeTrackingDecoder { // lat_state is the next state in the arc graph lattice inline Token *FindOrAddToken(StateId state, StateId lat_state, int32 frame, BaseFloat tot_cost, bool *changed); - + // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens // all links, that have link_extra_cost > lattice_beam are pruned @@ -267,13 +267,13 @@ class LatticeTrackingDecoder { // on the final frame. If there are final tokens active, it uses // the final-probs for pruning, otherwise it treats all tokens as final. void PruneForwardLinksFinal(int32 frame); - + // Prune away any tokens on this frame that have no forward links. // [we don't do this in PruneForwardLinks because it would give us // a problem with dangling pointers]. // It's called by PruneActiveTokens if any forward links have been pruned void PruneTokensForFrame(int32 frame); - + // Go backwards through still-alive tokens, pruning them. note: cur_frame is // where hash toks_ are (so we do not want to mess with it because these tokens // don't yet have forward pointers), but we do all previous frames, unless we @@ -286,7 +286,7 @@ class LatticeTrackingDecoder { /// Version of PruneActiveTokens that we call on the final frame. /// Takes into account the final-prob of tokens. void PruneActiveTokensFinal(int32 cur_frame); - + /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); @@ -311,9 +311,10 @@ class LatticeTrackingDecoder { std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. const fst::Fst &fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeTrackingDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; @@ -331,9 +332,9 @@ class LatticeTrackingDecoder { // to the caller, who then has to call toks_.Delete(e) for each one. It was designed // this way for convenience in propagating tokens from one frame to the next. void ClearToks(Elem *list); - + void ClearActiveTokens(); - + }; diff --git a/src/doc/glossary.dox b/src/doc/glossary.dox index ba42ea12370..31fa62d3389 100644 --- a/src/doc/glossary.dox +++ b/src/doc/glossary.dox @@ -26,7 +26,7 @@ search function of your browser. For convenience the definition of each term's section is preceded and followed by a colon, so for instance, typing ctrl-f ":lattice:" would take you to the section for "lattice". - +

@@ -37,7 +37,7 @@ synonymous with a sequence of transition-ids. Most of the time an alignment is derived from aligning the reference transcript of an utterance, in which case it is called a forced alignment. lattices also contain alignment information as sequences of transition-ids for each word -sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows +sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows alignments in a human-readable format. :forced alignment: see alignment. @@ -54,6 +54,18 @@ of the HMMs, and also various other important integer mappings; see \ref transit This object is generally written at the start of model files. The program \ref bin/show-transitions.cc "show-transitions" shows these. +:G.fst: The grammar FST G.fst which lives in the + data/lang/ directory in the scripts (see \ref data_prep_lang) represents + the language model in a Finite State Transducer format (see www.openfst.org). + For the most part it is an acceptor, meaning the input and output symbols on the + arcs are the same, but for statistical language models with backoff, the backoff + arcs have the "disambiguation symbol" #0 on the input side only. + For many purposes you'll want to get rid of the disambiguation symbols + using the command fstproject --project_output=true. The disambiguation symbols + are needed during graph compilation to make the FST determinizable, but for things + like language-model rescoring you don't want them. + +
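 As a sketch (not taken from the Kaldi scripts), the same projection can be done
 from C++ with the OpenFst library, which is what the fstproject command wraps;
 the enum name below is the one used by the OpenFst versions current at the time:
\code
// Replace the input labels of G.fst by its output labels, so the #0 backoff
// disambiguation symbols disappear from the input side.
fst::VectorFst<fst::StdArc> *g =
    fst::VectorFst<fst::StdArc>::Read("data/lang/G.fst");  // NULL on failure.
fst::Project(g, fst::PROJECT_OUTPUT);
// ... use or write g, then:
delete g;
\endcode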
*/ diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/doc/install.dox b/src/doc/install.dox index 0ffb2b1220f..b40b139a8dc 100644 --- a/src/doc/install.dox +++ b/src/doc/install.dox @@ -29,8 +29,8 @@ possibly including unfinished and experimental features, can be downloaded by typing into a shell: \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden - cd kaldi-trunk + git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream + cd kaldi \endverbatim If you want to get updates and bug fixes you can go to some checked-out directory, and type diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox index ee2bc11d8b9..df9f96e8430 100644 --- a/src/doc/tree_externals.dox +++ b/src/doc/tree_externals.dox @@ -32,13 +32,13 @@ namespace kaldi { The basic algorithm that is being implemented is a top-down greedy splitting, where we have a number of ways we can split the data by asking about, say, the left phone, the right - phone, the central phone, the state we're in, and so on. + phone, the central phone, the state we're in, and so on. The algorithm we implement is similar to the standard algorithm, see for example the paper "Tree-based State Tying for High Accuracy Acoustic Modeling" by Young, Odell and Woodland. In this algorithm, we split the data up by asking the locally optimal question, i.e. the one that gives the most likelihood increase, supposing - we model the data on each side of the split by a single Gaussian. - Differences from standard implementations include added flexibility + we model the data on each side of the split by a single Gaussian. + Differences from standard implementations include added flexibility about how to configure the tree roots; the ability to ask questions about the HMM-state and the central phone; and the fact that by default in the Kaldi scripts, the questions are automatically generated by a top-down binary clustering of the data, which means @@ -50,7 +50,7 @@ namespace kaldi { be the tree roots. For how to configure it using the standard scripts, see \ref data_prep. In practice we generally let each tree-root correspond to a "real phone", meaning that we group together all word-position-dependent, tone-dependent or stress-dependent versions of - each phone into one group that becomes a tree root. + each phone into one group that becomes a tree root. The rest of this page mostly gives details at the code level of what is happening. 
@@ -74,7 +74,7 @@ below summarizes these values: N is the width of the context window and P is the identity of the designated -"central phone". Normally P is exactly in the middle of the window +"central phone". Normally P is exactly in the middle of the window (hence the name "central-position"); for example, with N=3, we would normally have P=1, but you are free to choose any value from 0 to N-1; for instance, P=2 and N=3 means two phones of left context and no right context at all. @@ -82,32 +82,32 @@ In the code, when we talk about the "central phone" we always mean the P'th phone which may or may not actually be the central phone of the context window. A vector of integers representing a typical triphone context window might be: -\code -// probably not valid C++ +\code +// probably not valid C++ vector ctx_window = { 12, 15, 21 }; \endcode -Assuming N=3 and P=1, this would represent phone 15 with +Assuming N=3 and P=1, this would represent phone 15 with a right context of 21 and a left context of 12. The way we handle end effects is using zero (which is not a valid phone because it's reserved in OpenFst for the epsilon meaning "no symbol"), so for instance: -\code +\code vector ctx_window = { 12, 15, 0 }; \endcode means phone 15 with a left-context of 12 and no right-context because it's the end of the utterance. At the end of utterance in particular, the use of zero this way may be a little unexpected because the last "phone" is actually the -subsequential symbol "$" (see \ref graph_c), but for the convenience +subsequential symbol "$" (see \ref graph_c), but for the convenience of the decision-tree code we don't put the subsequential symbol in these context windows, we put zero. Note that if we had N=3 and P=2, the above context window would be invalid because its P'th element would be zero which is not a real phone; also of course, -if we had a tree with N=1, neither of the windows above would be valid because they +if we had a tree with N=1, neither of the windows above would be valid because they are the wrong size. In the monophone case, we would have a window like: -\code +\code vector ctx_window = { 15 }; \endcode so monophone systems are just treated as a special case of context-dependent -systems, with a window size N of 1 and a tree that doesn't do anything very +systems, with a window size N of 1 and a tree that doesn't do anything very interesting. @@ -126,28 +126,28 @@ TransitionModel object and an AmDiagGmm object). If the program gmm-init-mono receives an option called --shared-phones, it will share the pdfs between specified sets of phones; otherwise it makes all the phones separate. -After training a monophone system starting from a flat start, we take +After training a monophone system starting from a flat start, we take the monophone alignments -and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc +and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc "acc-tree-stats") to accumulate statistics for training the tree. This program is not limited to reading in monophone alignments; it works from context-dependent alignments too so we can build trees based on e.g. triphone alignments. -The statistics for tree building are written to disk as the type \ref BuildTreeStatsType -(see \ref treei_stats). +The statistics for tree building are written to disk as the type \ref BuildTreeStatsType +(see \ref treei_stats). 
The function AccumulateTreeStats() takes the values N and P, as explained in the previous section; the command-line programs will set these by default to 3 and 1 respectively, but this can be overridden using the --context-width -and --central-position options. The program \ref acc-tree-stats.cc +and --central-position options. The program \ref acc-tree-stats.cc "acc-tree-stats" takes a list of context-independent phones (e.g. silence), but this is not required even if there are context-independent phones; it is just -a mechanism to reduce the size of the statistics. +a mechanism to reduce the size of the statistics. For context-independent hones, the program will accumulate the corresponding statistics without the keys corresponding to the left and right phones defined (c.f. \ref treei_event_map). When the statistics have been -accumulated we use the program \ref build-tree.cc "build-tree" to -build the tree. This outputs the tree. +accumulated we use the program \ref build-tree.cc "build-tree" to +build the tree. This outputs the tree. The program \ref build-tree.cc "build-tree" requires three things: - The statistics (of type BuildTreeStatsType) - The questions config (of type Questions) @@ -160,21 +160,32 @@ scripts, these are automatically obtained from tree-building statistics by the program cluster-phones. The roots file specifies sets of phones that are goint to have shared roots in the decision-tree clustering process, and says for each phone set the following two things: - - "shared" or "not-shared" says whether or not there should be separate - roots for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, - in the typical case), or if the roots - should be shared. If we are going to be splitting (the "split" option - below) we enforce that the roots should be shared. + + - "shared" or "not-shared" says whether or not there should be separate roots + for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, in the + typical case), or if the roots should be shared. If it says "shared" there + will be a single tree-root for all HMM states (e.g. all three states, in a + normal topology) ; if "not-shared" there would be (e.g.) three tree-roots, + one for each pdf-class. + - "split" or "not-split" says whether or not the decision tree splitting should actually be done for the roots in question (for silence, we - typically don't split). + typically don't split). If the line says "split" (the normal case) then + we do the decision tree splitting. If it says "not-split" then no splitting + is done and the roots are left un-split. -Be careful because the notation is a bit tricky. The "shared" on the line of -the roots file is about whether we will share all the 3 HMM-states of the phone -in a single tree root. But we will always share together the roots of all the phones that -appear on a single lines of the roots file. This is not configurable via these -strings because if you don't want to share them, you can just put them on -separate lines of the roots file. + +The following will clarify some aspects of how this works: + + - If we say "shared split", then + even though there is one root node for all three HMM-states, the different + HMM states can still get different leaves because the tree can ask questions + about the pdf-class as well as about phonetic context. + + - We always share together the roots of all the phones that appear on a single + lines of the roots file. 
This is not configurable via these strings because + if you don't want to share the phones' roots, you can just put them on + separate lines of the roots file. Below is an example of a roots file; this assumes that phone 1 is silence and all the other phones have separate roots. @@ -185,14 +196,14 @@ shared split 3 ... shared split 28 \endverbatim -Having multiple phones on the same line is most useful when we have things like position and +Having multiple phones on the same line is most useful when we have things like position and stress-dependent phones; in this case each "real" phone would correspond to a set of integer phone ids. In that case we share the roots for all versions of a particular underlying phone. Below is an example of a roots file -for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; +for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; it would have to be converted to integer form before being read by Kalid): \verbatim -not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S +not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S shared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_S shared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_S shared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_S @@ -207,7 +218,7 @@ When creating the roots file, you should ensure that at least one phone on each For instance, in this case, if the phone AY was seen in at least some combination of stress and word-position, we would be OK. -In this example, we have various word-position-dependent variants of silence and so on. +In this example, we have various word-position-dependent variants of silence and so on. In this example they will all share their pdf's because they are on the same line and are "not-split"-- but they may have different transition parameters. In fact, most of these variants of silence would never be used as silence never appears inside words; this is for @@ -224,13 +235,13 @@ tree to another using the program \ref convert-ali.cc "convert-ali". pdf-id, and these are contiguous (typically there are several thousand of these in an LVCSR system). They are originally assigned when the tree is first built. Depending how the tree is built, it may or may not be possible to say, for each pdf-id, which phone - it corresponds to. + it corresponds to. \section tree_ctxdep Context dependency objects The ContextDependencyInterface object is a virtual base-class for the tree that specifies how it interacts with the graph-building code. This - interface contains only four functions: + interface contains only four functions: - \ref ContextDependencyInterface::ContextWidth() "ContextWidth()" returns the value of N (context-width) that the tree requires. 
- \ref ContextDependencyInterface::CentralPosition() "CentralPosition()" returns @@ -264,8 +275,8 @@ else \endcode The only class that currently inherits from ContextDependencyInterface -is the class ContextDependency, which has marginally richer interface; -the only important addition is the function \ref ContextDependency::GetPdfInfo +is the class ContextDependency, which has marginally richer interface; +the only important addition is the function \ref ContextDependency::GetPdfInfo "GetPdfInfo" which is used by the TransitionModel class to work out which phones a particular pdf can possibly correspond to (this function could be emulated given only the interface of ContextDependencyInterface, by @@ -274,7 +285,7 @@ enumerating all contexts). The ContextDependency object is actually a fairly thin wrapper for the EventMap object; see \ref tree_internals. We wanted to hide the actual implementation of the tree as much as possible to make it -easy to refactor the code later if needed. +easy to refactor the code later if needed. \section tree_example An example of a decision tree @@ -309,18 +320,18 @@ Below is a kind of quasi-BNF notation that explains the tree-file format. In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that splits on key 1, which is the central phone. In square brackets are a contiguous range of phone-ids. As it happens, these don't represent a question, but are just a way of -splitting on phones so we can get to the "real" decision trees which are per phone. +splitting on phones so we can get to the "real" decision trees which are per phone. The issue is that this tree was built with "shared roots", so there are various phone-ids, corresponding to different word-position-and-stress-marked versions of the same phone, that share the root. We can't use a TableEventMap (TE) at the top level of the tree, or we'd have to repeat each decision tree several times (since the EventMap is a pure -tree, not a general graph, it has no mechanism for pointers to be "shared"). -The next few instances of the "SE" label are also part of this "quasi-tree" which +tree, not a general graph, it has no mechanism for pointers to be "shared"). +The next few instances of the "SE" label are also part of this "quasi-tree" which is initially splitting on the central phone (as we go down this file we are going deeper into the tree; notice that the braces "{" are opening but not yet closing). Then we have the string "TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap -on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. +on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. The values represent the five pdf-ids for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these three non-speech phones (only the transition matrix is specific to each non-speech phone). @@ -332,8 +343,8 @@ various versions of the phone AA; and question is asking whether the pdf-class ( has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the next question is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various forms of the phone "M" (a rather unintuitive question to ask, since we're -in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 
286 287 ]" which is -a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if +in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is +a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if no, 696 ("CE 696"). \verbatim s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100 @@ -366,8 +377,8 @@ SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 4 \endverbatim Below is a simpler example: the monophone tree from the Resource Management -recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). -The key "0" is the phone-position of zero which represents the central (and only) phone +recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). +The key "0" is the phone-position of zero which represents the central (and only) phone since the context width (N) is 1. The number of entries in the table is 49 (in this case, the number of phones plus one). The first EventMap in the table (index zero) is NULL, because there is no phone with @@ -375,11 +386,11 @@ index zero. The next one is a TableEventMap with three elements, corresponding to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )". \verbatim s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5 -ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) -TE -1 3 ( CE 3 CE 4 CE 5 ) -TE -1 3 ( CE 6 CE 7 CE 8 ) -TE -1 3 ( CE 9 CE 10 CE 11 ) -TE -1 3 ( CE 12 CE 13 CE 14 ) +ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) +TE -1 3 ( CE 3 CE 4 CE 5 ) +TE -1 3 ( CE 6 CE 7 CE 8 ) +TE -1 3 ( CE 9 CE 10 CE 11 ) +TE -1 3 ( CE 12 CE 13 CE 14 ) \endverbatim @@ -391,8 +402,8 @@ disambiguation symbols and possibly epsilon symbols). In the graph, as always, these are represented by integer labels. We use an object that, in code and in filenames, is generally called ilabel_info. The ilabel_info object 4has a strong connection to the \ref fst::ContextFst "ContextFst" objects, see \ref graph_context. -As with many other Kaldi types, ilabel_info is a generic (STL) type but -we use a consistent variable name +As with many other Kaldi types, ilabel_info is a generic (STL) type but +we use a consistent variable name to make it identifiable. It is of the following type: \code std::vector > ilabel_info; @@ -402,7 +413,7 @@ input label the corresponding phonetic context window (see above, \ref tree_window). For example, suppose symbol 1500 is phone 30 with a right-context of 12 and a left-context of 4, we would have -\code +\code // not valid C++ ilabel_info[1500] == { 4, 30, 12 }; \endcode @@ -410,14 +421,14 @@ In the monophone case, we would have things like: \code ilabel_info[30] == { 28 }; \endcode -There are special cases to deal with disambiguation symbols (see -\ref graph_disambig or the +There are special cases to deal with disambiguation symbols (see +\ref graph_disambig or the Springer Handbook paper referenced above for an explanation of what these are). 
If an ilabel_info entry corresponds to a disambiguation symbol, we put in it the negative of the symbol-table entry of the disambiguation symbol (note that this is not the same as the number of the printed form -of the disambiguation symbol as in #0, #1, #2 etc., it is the number -corresponding to it in a symbol-table file, which in our current scripts is +of the disambiguation symbol as in #0, #1, #2 etc., it is the number +corresponding to it in a symbol-table file, which in our current scripts is called phones_disambig.txt). For example, \code ilabel_info[5] == { -42 }; @@ -428,7 +439,7 @@ so the programs that interpret the ilabel_info object don't need to be given a list of disambiguation symbols in order to be able to distinguish them from real phones in the monophone case. There are two additional special cases: we have -\code +\code ilabel_info[0] == { }; // epsilon ilabel_info[1] == { 0 }; // disambig symbol #-1; // we use symbol 1, but don't consider this hardwired. diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc index af1f7b1a346..5a0fb2a48fa 100644 --- a/src/feat/feature-fbank.cc +++ b/src/feat/feature-fbank.cc @@ -109,7 +109,7 @@ void Fbank::ComputeInternal(const VectorBase &wave, // Get dimensions of output features int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); - int32 cols_out = opts_.mel_opts.num_bins + opts_.use_energy; + int32 cols_out = opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); if (rows_out == 0) { output->Resize(0, 0); if (wave_remainder != NULL) diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h index 2a3819c5f62..febfeac9f9b 100644 --- a/src/feat/feature-fbank.h +++ b/src/feat/feature-fbank.h @@ -80,7 +80,9 @@ class Fbank { explicit Fbank(const FbankOptions &opts); ~Fbank(); - int32 Dim() const { return opts_.mel_opts.num_bins; } + int32 Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } /// Will throw exception on failure (e.g. if file too short for even one /// frame). 
The output "wave_remainder" is the last frame or two of the diff --git a/src/feat/signal.cc b/src/feat/signal.cc index 19b876989c2..a374c531e3d 100644 --- a/src/feat/signal.cc +++ b/src/feat/signal.cc @@ -34,11 +34,11 @@ void ElementwiseProductOfFft(const Vector &a, Vector *b) { void ConvolveSignals(const Vector &filter, Vector *signal) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); - Vector signal_padded(signal_length + filter_length - 1); + Vector signal_padded(signal_length + filter_length - 1); signal_padded.SetZero(); for (int32 i = 0; i < signal_length; i++) { for (int32 j = 0; j < filter_length; j++) { - signal_padded(i + j) += (*signal)(i) * filter(j); + signal_padded(i+j) += (*signal)(i) * filter(j); } } signal->CopyFromVec(signal_padded.Range(0, signal_length)); @@ -54,11 +54,11 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector SplitRadixRealFft srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); - Vector signal_padded(fft_length); + Vector signal_padded(fft_length); signal_padded.Range(0, signal_length).CopyFromVec(*signal); srfft.Compute(signal_padded.Data(), true); @@ -70,7 +70,8 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector signal->CopyFromVec(signal_padded.Range(0, signal_length)); } -void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal, + bool apply_inverse) { int32 signal_length = signal->Dim(); int32 filter_length = filter.Dim(); @@ -83,13 +84,37 @@ void FFTbasedBlockConvolveSignals(const Vector &filter, Vector srfft(fft_length); - Vector filter_padded(fft_length); + Vector filter_padded(fft_length); filter_padded.Range(0, filter_length).CopyFromVec(filter); srfft.Compute(filter_padded.Data(), true); + + // If true, inverse of filter is computed and + // input signal is convolved with inverse of filter. + // The inverse of filter H_inv(w) is estimated as + // conj(H(w))/( abs(H(w))^2 + const) + if (apply_inverse) { + BaseFloat abs_Hw, const_val = 0.0; + int32 half_N = filter_padded.Dim() / 2; + Vector inv_filter_padded(filter_padded); + inv_filter_padded(0) = + filter_padded(0) / (filter_padded(0) * filter_padded(0) + const_val); + inv_filter_padded(1) = + filter_padded(1) / (filter_padded(1) * filter_padded(1) + const_val); + for (int32 bin = 1; bin < half_N; bin++) { + int32 w_real_ind = 2 * bin, + w_im_ind = 2 * bin + 1; + abs_Hw = filter_padded(w_real_ind) * filter_padded(w_real_ind) + + filter_padded(w_im_ind) * filter_padded(w_im_ind); + + inv_filter_padded(w_real_ind) /= (abs_Hw + const_val); + inv_filter_padded(w_im_ind) *= -1.0 / (abs_Hw + const_val); + } + filter_padded.CopyFromVec(inv_filter_padded); + } - Vector temp_pad(filter_length - 1); + Vector temp_pad(filter_length - 1); temp_pad.SetZero(); - Vector signal_block_padded(fft_length); + Vector signal_block_padded(fft_length); for (int32 po = 0; po < signal_length; po += block_length) { // get a block of the signal diff --git a/src/feat/signal.h b/src/feat/signal.h index 7ff0ce33b52..b9a49473b96 100644 --- a/src/feat/signal.h +++ b/src/feat/signal.h @@ -44,7 +44,8 @@ void FFTbasedConvolveSignals(const Vector &filter, Vector overlap-add method. This is an efficient way to evaluate the discrete convolution of a long signal with a finite impulse response filter. 
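   If "apply_inverse" is true (a sketch of the intent, based on the
   implementation in signal.cc): the filter's frequency response H(w) is
   approximately inverted as conj(H(w)) / (|H(w)|^2 + const), and the signal is
   convolved with that inverse instead, which roughly undoes the effect of the
   filter.  Usage sketch (illustrative sizes only):

     Vector<BaseFloat> signal(16000), filter(100);
     signal.SetRandn(); filter.SetRandn();
     FFTbasedBlockConvolveSignals(filter, &signal);        // signal <- conv(signal, filter)
     FFTbasedBlockConvolveSignals(filter, &signal, true);  // approximately undoes it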
*/ -void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal); +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal, + bool apply_inverse = false); } // namespace kaldi diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 9843e7bbd4b..bff2b212a5b 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -15,7 +15,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \ process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \ compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \ wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \ - concat-feats + concat-feats compute-filter apply-filter OBJFILES = diff --git a/src/featbin/apply-filter.cc b/src/featbin/apply-filter.cc new file mode 100644 index 00000000000..8ddfd0073c2 --- /dev/null +++ b/src/featbin/apply-filter.cc @@ -0,0 +1,114 @@ +// featbin/apply-filters.cc + +// Copyright 2016 Pegah Ghahremani + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" +#include "feat/signal.h" + +namespace kaldi { +void ApplyFilter(const Vector &input, + const Vector &filter, + Vector *filtered_input) { + int32 min_size = 0, + size = input.Dim(), f_order = filter.Dim(); + filtered_input->Resize(size); + // compute filtered input as y_j = sum_{i=1}^n x_(j-i) * a_i + // where input is y and filtered version is xi. + // sp x_j = 1/a_0 * (y_j - sum_{i=1}^p a_i * x(j-i)) + (*filtered_input)(0) = input(0); + for (int32 i = 0; i < size; i++) { + min_size = std::min(f_order, i); + BaseFloat sum = 0; + for (int32 j = 1; j < min_size; j++) + sum += filter(j) * (*filtered_input)(i-j); + KALDI_ASSERT(filter(0) != 0); + (*filtered_input)(i) = (input(i) - sum) / filter(0); + } +} + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "apply filter to wave files supplied via input pip as FIR or IIR filter.\n" + "If the --inverse=false, it applies filter as FIR filter\n" + "and if --inverse=true, the inverse of filter applies as IIR filter.\n" + "Usage: apply-filters [options...] " + " \n" + "e.g. 
\n" + "apply-filters --inverse=false --utt2spkfilter=ark:data/train/utt2spkfilter \n" + " input.wav filter.wav output_1.wav\n"; + ParseOptions po(usage); + + bool inverse = false; + std::string utt2spkfilter_rspecifier = ""; + po.Register("inverse", &inverse, + "If false, the filter is applied as FIR filter," + "otherwise its inverse applied as IIR filter."); + po.Register("utt2spkfilter", &utt2spkfilter_rspecifier, + "rspecifier for utterance to spkear-filter list map" + " used to filter each utterance"); + po.Read(argc, argv); + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + std::string input_wave_file = po.GetArg(2), + filter_file = po.GetArg(1), + output_wave_file = po.GetArg(3); + + WaveData input_wave; + { + Input ki(input_wave_file); + input_wave.Read(ki.Stream()); + + } + + SequentialBaseFloatVectorReader filter_reader(filter_file); + const Vector &lpc_filter = filter_reader.Value(); + + Vector filtered_wav(input_wave.Data().Row(0)); + BaseFloat samp_freq_input = input_wave.SampFreq(); + // If inverse = false, it does FFT-based block Convolution of filter with + // long input signal. + // Otherwise inverse of filter is convolved with input signal. + // If we use lp coefficients as [1 -a1 -a2 ... ap] as filter + // convolving input with this filter is like whitening transform. + // y'[n] = y[n] - sum_{i=1}^p {input_wav[n-i] * lpc_coeffs[i]} + // = conv(y, [1 :-lpc-coeffs]) + Vector orig_wav(filtered_wav); + //if (inverse) + // ApplyFilter(orig_wav, lpc_filter, &filtered_wav); + //else + FFTbasedBlockConvolveSignals(lpc_filter, &filtered_wav, inverse); + Matrix filtered_wav_mat(1, filtered_wav.Dim()); + filtered_wav_mat.CopyRowsFromVec(filtered_wav); + WaveData out_wave(samp_freq_input, filtered_wav_mat); + Output ko(output_wave_file, false); + out_wave.Write(ko.Stream()); + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/featbin/compute-filter.cc b/src/featbin/compute-filter.cc new file mode 100644 index 00000000000..4f750e418cf --- /dev/null +++ b/src/featbin/compute-filter.cc @@ -0,0 +1,300 @@ +// featbin/compute-filters.cc + +// Copyright 2016 Pegah Ghahremani + + +#include "feat/feature-functions.h" +#include "matrix/srfft.h" +#include "matrix/kaldi-matrix-inl.h" +#include "feat/wave-reader.h" +namespace kaldi { + +// struct used to store statistics required for +// computing correlation coefficients +struct CorrelationStats { + int32 corr_order; // number of correlation coefficient. R[0],..,R[corr_order - 1] + int32 num_samp; // number of samples + BaseFloat samp_sum; // sum of samples. + Vector l2_norms; // l2_norms[j] - inner product of shifted input by itself as + // sum_{i=0}^corr_window_size x[i+j]^2 + Vector inner_prod; // inner product of input vector with its shifted version by j + // sum_{i=0}^corr_window_size x[i] * x[i+j] + CorrelationStats(): corr_order(100), num_samp(0), samp_sum(0) { + l2_norms.Resize(corr_order); + inner_prod.Resize(corr_order);} + + CorrelationStats(int32 corr_order): corr_order(corr_order), + num_samp(0), samp_sum(0) { + l2_norms.Resize(corr_order); + inner_prod.Resize(corr_order); } +}; + +/* + This function computes and accumulates statistics + required for computing auto-correlation coefficient using waveform "wave", + e.g dot-product of input with its shifted version. 
+ inner_prod[j] - inner product of the input vector with its version shifted by j, + sum_{i=0}^corr_window_size x[i] * x[i+j] + l2_norms[j] - inner product of the shifted input with itself, + sum_{i=0}^corr_window_size x[i+j]^2 + lpc_order is the size of autocorr_coeffs. +*/ +void AccStatsForCorrelation(const VectorBase &wave, + int32 lpc_order, + CorrelationStats *acc_corr_stats) { + KALDI_ASSERT(acc_corr_stats->inner_prod.Dim() == lpc_order); + acc_corr_stats->samp_sum += wave.Sum(); + acc_corr_stats->num_samp += wave.Dim(); + int32 corr_window_size = wave.Dim() - lpc_order; + Vector norm_wave(wave); + SubVector sub_vec1(norm_wave, 0, corr_window_size); + BaseFloat local_l2_norm = VecVec(sub_vec1, sub_vec1), sum; + + acc_corr_stats->inner_prod(0) += local_l2_norm; + + for (int32 lag = 1; lag < lpc_order; lag++) { + SubVector sub_vec2(norm_wave, lag, corr_window_size); + int32 last_ind = corr_window_size + lag - 1; + local_l2_norm += (wave(last_ind) * wave(last_ind) - + wave(lag - 1) * wave(lag - 1)); + sum = VecVec(sub_vec1, sub_vec2); + acc_corr_stats->inner_prod(lag) += sum; + acc_corr_stats->l2_norms(lag) += local_l2_norm; + } +} +/* + Compute autocorrelation coefficients from the accumulated unnormalized statistics, + i.e. the inner products and l2 norms. + The inner products and l2_norms can be normalized using the mean E[x]: + autocorr[j] = sum_{i=0}^n (x[i] - E[x]) * (x[i+j] - E[x]) / + [(sum_{i=0}^n (x[i] - E[x])^2) * (sum_{i=0}^n (x[i+j] - E[x])^2)]^0.5 + autocorr[j] = inner_prod[j] / (norms[0] * norms[j])^0.5 + inner_prod[j] - inner product of the input vector with its version shifted by j, + sum_{i=0}^n x[i] * x[i+j] + l2_norms[j] - inner product of the shifted input with itself, sum_{i=0}^n x[i+j]^2 +*/ +void ComputeCorrelation(const CorrelationStats &acc_corr_stats, + Vector *autocorr) { + + KALDI_ASSERT(acc_corr_stats.inner_prod.Dim() == acc_corr_stats.l2_norms.Dim()); + + int32 lpc_order = acc_corr_stats.inner_prod.Dim(); + autocorr->Resize(lpc_order); + for (int32 lag = 0; lag < lpc_order; lag++) + (*autocorr)(lag) = acc_corr_stats.inner_prod(lag); + + // scale the autocorrelation so that autocorr(0) == 1 + autocorr->Scale(1.0 / (*autocorr)(0)); + +} +/* + Durbin's recursion - converts autocorrelation coefficients to LPC coefficients. + pTmp - temporary buffer [n] + pAC - autocorrelation coefficients [n + 1] + pLP - linear prediction coefficients [n] (predicted_s[n] = sum_{i=1}^P a[i] * s[n-i]) +*/ +double DurbinInternal(int32 n, double *pAC, double *pLP, double *pTmp) { + double ki; // reflection coefficient + + // we add this bias term to pAC[0]. + // Adding the bias term is equivalent to t = toeplitz(pAC) + diag(bias), + // which shifts the eigenvalues of toeplitz(pAC) by the bias + // and ensures that t is invertible.
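For reference, the recursion implemented by DurbinInternal is the textbook Levinson-Durbin algorithm. Below is a minimal, self-contained sketch of that algorithm (illustrative only, not part of this patch); it uses the standard sign convention in which the predictor is x_hat[n] = sum_{i=1}^p a[i] * x[n-i], whereas the patch stores the filter as [1, -a[1], ..., -a[p]] so that convolving with it yields the prediction residual.

#include <vector>

// Textbook Levinson-Durbin: given autocorrelation r[0..p] (with r[0] > 0),
// return a[0..p] where a[1..p] are the forward-predictor coefficients.
std::vector<double> LevinsonDurbin(const std::vector<double> &r, int p) {
  std::vector<double> a(p + 1, 0.0), prev(p + 1, 0.0);
  double err = r[0];                      // order-0 prediction error
  for (int i = 1; i <= p; ++i) {
    double k = r[i];                      // reflection coefficient
    for (int j = 1; j < i; ++j)
      k -= a[j] * r[i - j];
    k /= err;
    prev = a;
    a[i] = k;
    for (int j = 1; j < i; ++j)           // update lower-order coefficients
      a[j] = prev[j] - k * prev[i - j];
    err *= (1.0 - k * k);                 // error shrinks by (1 - k^2)
  }
  return a;
}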
+ double durbin_bias = 1e-2; + int32 max_repeats = 20; + + double E = pAC[0]; + pLP[0] = 1.0; + for (int32 i = 1; i <= n; i++) { + // next reflection coefficient + ki = pAC[i]; + for (int32 j = 1; j < i; j++) + ki += pLP[j] * pAC[i - j]; + ki = ki / E; + + if (std::abs(ki) > 1) { + int32 num_repeats = int((pAC[0] - 1.0) / durbin_bias); + KALDI_WARN << "In Durbin's algorithm, abs(ki) > 1 " + << " for iteration = " << i + << " ki = " << ki + << " autocorr[0] = " << pAC[0] + << " num_repeats = " << num_repeats + << "; adding the bias."; + pAC[0] += durbin_bias; + if (num_repeats < max_repeats) + return -1; + } + // new error + double c = 1 - ki * ki; + if (c < 1.0e-5) // avoid NaNs for constant signals + c = 1.0e-5; + + E *= c; + // new LP coefficients + pTmp[i] = -ki; + for (int32 j = 1; j < i; j++) + pTmp[j] = pLP[j] - ki * pLP[i - j]; + + for (int32 j = 1; j <= i; j++) + pLP[j] = pTmp[j]; + } + return E; +} +/* + This function computes the coefficients of the forward linear predictor + from the autocorrelation coefficients by minimizing the prediction + error in the MSE sense. + Durbin's recursion is used to compute the LP coefficients from the autocorrelation coefficients. + R(j) = sum_{i=1}^P R((i+j) % P) * a[i], j = 0, ..., P + P is the order of linear prediction. + lp_filter = [1, -a[1], -a[2], ..., -a[P]], + where a[i] are the linear prediction coefficients (predicted_x[n] = sum_{i=1}^P a[i] * x[n-i]), so + x[n] - predicted_x[n] = sum_{i=0}^P lp_filter[i] * x[n-i] + = conv(x, lp_filter)[n]. + R(j) is the j-th autocorrelation coefficient. +*/ +void ComputeFilters(const VectorBase &autocorr, + Vector *lp_filter) { + int32 n = autocorr.Dim(); + lp_filter->Resize(n); + // compute lpc coefficients from the autocorrelation coefficients + // with Durbin's algorithm + Vector d_autocorr(autocorr), + d_lpc_coeffs(n), d_tmp(n); + + KALDI_LOG << "computing lpc from correlations "; + while (DurbinInternal(n, d_autocorr.Data(), + d_lpc_coeffs.Data(), + d_tmp.Data()) < 0); + lp_filter->CopyFromVec(d_lpc_coeffs); + if (KALDI_ISNAN(lp_filter->Sum())) { + KALDI_WARN << "NaN encountered in the lpc coefficients derived from Durbin's algorithm."; + lp_filter->Set(0.0); + (*lp_filter)(0) = 1.0; + } + +} + +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using kaldi::int32; + + const char *usage = + "Computes LP coefficients per speaker, by minimizing the " + "prediction error in the MSE sense.\n" + "These coefficients contain speaker-dependent information corresponding to each speaker.\n" + + "Usage: compute-filter [options] \n" + "e.g.: compute-filter " + " scp:data/train/wav.scp ark,scp:filter.ark,filter.scp\n"; + + ParseOptions po(usage); + std::string spk2utt_rspecifier; + bool binary = true; + int32 channel = -1, + lpc_order = 100; + po.Register("binary", &binary, "write in binary mode (applies only to global filters)"); + po.Register("lpc-order", &lpc_order, "number of LP coefficients used to extract filters."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + int32 num_done = 0, num_err = 0; + std::string wav_rspecifier = po.GetArg(1), + wspecifier = po.GetArg(2); + + BaseFloatVectorWriter writer(wspecifier); + if (spk2utt_rspecifier != "") { + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessTableReader wav_reader(wav_rspecifier); + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + std::string spk = spk2utt_reader.Key(); + const std::vector &uttlist = spk2utt_reader.Value(); + CorrelationStats acc_corr_stats(lpc_order); + for (size_t i = 0; i <
uttlist.size(); i++) { + std::string utt = uttlist[i]; + if (!wav_reader.HasKey(utt)) { + KALDI_WARN << "Did not find wave for utterance " << utt; + num_err++; + continue; + } + const WaveData &wav_data = wav_reader.Value(utt); + int32 num_chan = wav_data.Data().NumRows(), this_chan = channel; + KALDI_ASSERT(num_chan > 0); + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << spk << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + num_err++; + continue; + } + } + Vector waveform(wav_data.Data().Row(this_chan)); + waveform.Scale(1.0 / (1 << 15)); + AccStatsForCorrelation(waveform, lpc_order, + &acc_corr_stats); + } + Vector filter, autocorr(lpc_order); + ComputeCorrelation(acc_corr_stats, + &autocorr); + ComputeFilters(autocorr, &filter); + writer.Write(spk, filter); + num_done++; + } + } else { // assume the input waveform is per-speaker. + SequentialTableReader wav_reader(wav_rspecifier); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string spk = wav_reader.Key(); + const WaveData &wave_data = wav_reader.Value(); + int32 num_chan = wave_data.Data().NumRows(), this_chan = channel; + KALDI_ASSERT(num_chan > 0); + if (channel == -1) { + this_chan = 0; + if (num_chan != 1) + KALDI_WARN << "Channel not specified but you have data with " + << num_chan << " channels; defaulting to zero"; + } else { + if (this_chan >= num_chan) { + KALDI_WARN << "File with id " << spk << " has " + << num_chan << " channels but you specified channel " + << channel << ", producing no output."; + num_err++; + continue; + } + } + Vector waveform(wave_data.Data().Row(this_chan)); + Vector autocorr, filter; + waveform.Scale(1.0 / (1 << 15)); + KALDI_ASSERT(waveform.Max() <=1 && waveform.Min() >= -1); + CorrelationStats acc_corr_stats(lpc_order); + + AccStatsForCorrelation(waveform, lpc_order, + &acc_corr_stats); + ComputeCorrelation(acc_corr_stats, + &autocorr); + //KALDI_LOG << "autocorr = " << autocorr; + ComputeFilters(autocorr, &filter); + writer.Write(spk, filter); + num_done++; + } + } + KALDI_LOG << "Done " << num_done << " speakers, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fstbin/fstaddselfloops.cc b/src/fstbin/fstaddselfloops.cc index 9219093bee1..96895f23cf4 100644 --- a/src/fstbin/fstaddselfloops.cc +++ b/src/fstbin/fstaddselfloops.cc @@ -45,8 +45,9 @@ int main(int argc, char *argv[]) { "on at least one arc out of the state. 
Useful in conjunction with predeterminize\n" "\n" "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"; - + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; ParseOptions po(usage); po.Read(argc, argv); @@ -62,12 +63,12 @@ int main(int argc, char *argv[]) { fst_out_filename = po.GetOptArg(4); VectorFst *fst = ReadFstKaldi(fst_in_filename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " << kaldi::PrintableRxfilename(disambig_in_rxfilename); - + std::vector disambig_out; if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { WriteFstKaldi(*fst, fst_out_filename); delete fst; - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 25acf48a7d1..4e5cbd45282 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -429,18 +429,6 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model, } } -// comparator object that can be used to sort from greatest to -// least posterior. -struct CompareReverseSecond { - // view this as an "<" operator used for sorting, except it behaves like - // a ">" operator on the .second field of the pair because we want the - // sort to be in reverse order (greatest to least) on posterior. - bool operator() (const std::pair &a, - const std::pair &b) { - return (a.second > b.second); - } -}; - BaseFloat VectorToPosteriorEntry( const VectorBase &log_likes, int32 num_gselect, diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index 18bbd65a86a..4f5896da7c6 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -155,6 +155,18 @@ int32 MergePosteriors(const Posterior &post1, bool drop_frames, Posterior *post); +// comparator object that can be used to sort from greatest to +// least posterior. +struct CompareReverseSecond { + // view this as an "<" operator used for sorting, except it behaves like + // a ">" operator on the .second field of the pair because we want the + // sort to be in reverse order (greatest to least) on posterior. + bool operator() (const std::pair &a, + const std::pair &b) { + return (a.second > b.second); + } +}; + /// Given a vector of log-likelihoods (typically of Gaussians in a GMM /// but could be of pdf-ids), a number gselect >= 1 and a minimum posterior /// 0 <= min_post < 1, it gets the posterior for each element of log-likes diff --git a/src/ivector/plda.h b/src/ivector/plda.h index f5affa5d1ae..57609633169 100644 --- a/src/ivector/plda.h +++ b/src/ivector/plda.h @@ -73,8 +73,8 @@ class Plda { /// before giving them to the function LogLikelihoodRatio (it's /// done this way for efficiency because a given iVector may be /// used multiple times in LogLikelihoodRatio and we don't want - /// do repeat the matrix multiplication - /// + /// to repeat the matrix multiplication + /// /// If config.normalize_length == true, it will also normalize the length of /// the iVector so that it is equal to the sqrt(dim). 
The normalization /// factor is returned, even if config.normalize_length == false, in which @@ -88,7 +88,7 @@ class Plda { float TransformIvector(const PldaConfig &config, const VectorBase &ivector, VectorBase *transformed_ivector) const; - + /// Returns the log-likelihood ratio /// log (p(test_ivector | same) / p(test_ivector | different)). /// transformed_train_ivector is an average over utterances for @@ -100,7 +100,7 @@ class Plda { int32 num_train_utts, const VectorBase &transformed_test_ivector); - + /// This function smooths the within-class covariance by adding to it, /// smoothing_factor (e.g. 0.1) times the between-class covariance (it's /// implemented by modifying transform_). This is to compensate for @@ -108,7 +108,7 @@ class Plda { /// estimate of the within-class covariance, and where the leading elements of /// psi_ were as a result very large. void SmoothWithinClassCovariance(double smoothing_factor); - + int32 Dim() const { return mean_.Dim(); } void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); @@ -116,7 +116,7 @@ class Plda { void ComputeDerivedVars(); // computes offset_. friend class PldaEstimator; friend class PldaUnsupervisedAdaptor; - + Vector mean_; // mean of samples in original space. Matrix transform_; // of dimension Dim() by Dim(); // this transform makes within-class covar unit @@ -142,7 +142,7 @@ class PldaStats { /// to weight your training samples. void AddSamples(double weight, const Matrix &group); - + int32 Dim() const { return dim_; } void Init(int32 dim); @@ -151,9 +151,9 @@ class PldaStats { bool IsSorted() const; ~PldaStats(); protected: - + friend class PldaEstimator; - + int32 dim_; int64 num_classes_; int64 num_examples_; // total number of examples, sumed over classes. @@ -165,7 +165,7 @@ class PldaStats { SpMatrix offset_scatter_; // Sum over all examples, of the weight // times (example - class-mean). - + // We have one of these objects per class. struct ClassInfo { double weight; @@ -178,7 +178,7 @@ class PldaStats { ClassInfo(double weight, Vector *mean, int32 num_examples): weight(weight), mean(mean), num_examples(num_examples) { } }; - + std::vector class_info_; private: KALDI_DISALLOW_COPY_AND_ASSIGN(PldaStats); @@ -197,16 +197,16 @@ struct PldaEstimationConfig { class PldaEstimator { public: PldaEstimator(const PldaStats &stats); - + void Estimate(const PldaEstimationConfig &config, Plda *output); private: typedef PldaStats::ClassInfo ClassInfo; - + /// Returns the part of the objf relating to /// offsets from the class means. (total, not normalized) double ComputeObjfPart1() const; - + /// Returns the part of the obj relating to /// the class means (total_not normalized) double ComputeObjfPart2() const; @@ -217,7 +217,7 @@ class PldaEstimator { int32 Dim() const { return stats_.Dim(); } void EstimateOneIter(); - + void InitParameters(); void ResetPerIterStats(); @@ -233,7 +233,7 @@ class PldaEstimator { // Copy to output. void GetOutput(Plda *plda); - + const PldaStats &stats_; SpMatrix within_var_; @@ -254,7 +254,7 @@ struct PldaUnsupervisedAdaptorConfig { BaseFloat mean_diff_scale; BaseFloat within_covar_scale; BaseFloat between_covar_scale; - + PldaUnsupervisedAdaptorConfig(): mean_diff_scale(1.0), within_covar_scale(0.3), @@ -285,7 +285,7 @@ class PldaUnsupervisedAdaptor { // Add stats to this class. Normally the weight will be 1.0. 
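As a usage note on the CompareReverseSecond comparator that the posterior.h hunk above makes public: it is meant to be handed to std::sort so that (index, posterior) pairs end up ordered from greatest to least posterior. A small sketch (illustrative only, not part of the patch; the pair type std::pair<int32, BaseFloat> is assumed here, matching how gselect-style posteriors are stored elsewhere in Kaldi):

#include <algorithm>
#include <utility>
#include <vector>
#include "hmm/posterior.h"

// Sort (index, posterior) pairs so that the largest posteriors come first.
void SortByPosteriorDescending(
    std::vector<std::pair<kaldi::int32, kaldi::BaseFloat> > *pairs) {
  kaldi::CompareReverseSecond comp;  // acts like ">" on the .second field
  std::sort(pairs->begin(), pairs->end(), comp);
}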
void AddStats(double weight, const Vector &ivector); void AddStats(double weight, const Vector &ivector); - + void UpdatePlda(const PldaUnsupervisedAdaptorConfig &config, Plda *plda) const; @@ -293,7 +293,7 @@ class PldaUnsupervisedAdaptor { double tot_weight_; Vector mean_stats_; - SpMatrix variance_stats_; + SpMatrix variance_stats_; }; diff --git a/src/kwsbin/compute-atwv.cc b/src/kwsbin/compute-atwv.cc index 1b7476723c0..c7c8e484f8d 100644 --- a/src/kwsbin/compute-atwv.cc +++ b/src/kwsbin/compute-atwv.cc @@ -37,13 +37,34 @@ int main(int argc, char *argv[]) { const char *usage = "Computes the Actual Term-Weighted Value and prints it." "\n" - "Usage: compute-atwv [options] ref-rspecifier hyp-rspecifier [alignment csv]\n" - " e.g.: compute-atwv ark:ref.1 ark:hyp.1 ali.csv\n" + "Usage: compute-atwv [options] [alignment-csv-filename]\n" + " e.g.: compute-atwv 32485.4 ark:ref.1 ark:hyp.1 ali.csv\n" + " or: compute-atwv 32485.4 ark:ref.1 ark:hyp.1\n" "\n" - "where the alignment format is compatible with the alignment produced\n" - "using the F4DE tool -- you are responsible for mapping the utterance\n" - "identifiers and the term string to the correct ones - use the script\n" - "utils/int2sym.pl and the utterance/keyword maps\n"; + "NOTES: \n" + " a) the number of trials is usually equal to the size of the searched\n" + " collection in seconds\n" + " b) the ref-rspecifier/hyp-rspecifier are the kaldi IO specifiers for both\n" + " the reference and the hypotheses (found hits), respectively.\n" + " The format is the same for both of them. Each line is of \n" + " the following format\n" + "\n" + " \n\n" + " e.g.:\n\n" + " KW106-189 348 459 560 0.8\n" + "\n" + " c) the alignment-csv-filename is an optional parameter. If present,\n" + " the alignment, i.e. detailed information about which hypotheses match\n" + " up with which reference entries, will be generated. The alignment\n" + " file format is equivalent to the alignment file produced using\n" + " the F4DE tool. However, we do not set some fields and the utterance\n" + " identifiers are numeric. You can use the script utils/int2sym.pl\n" + " and the utterance/keyword maps to convert the numerical ids into text.\n" + " d) the scores are expected to be probabilities. Please note that\n" + " the output from the kws-search is in -log(probability).\n" + " e) compute-atwv does not perform any score normalization (it's just\n" + " for scoring purposes). Without score normalization/calibration\n" + " the performance of the search will be quite poor.\n"; ParseOptions po(usage); KwsTermsAlignerOptions ali_opts; diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index 8e92f939ef9..e38c62b3bfa 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -19,17 +19,6 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License.
-#ifdef _MSC_VER -#include -using std::unordered_map; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -using std::unordered_map; -#else -#include -using std::tr1::unordered_map; -#endif - #include #include #include "fstext/determinize-lattice.h" // for LatticeStringRepository diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc index 0ea66712eda..d8443bd7434 100644 --- a/src/lat/lattice-functions.cc +++ b/src/lat/lattice-functions.cc @@ -405,15 +405,11 @@ static inline double LogAddOrMax(bool viterbi, double a, double b) { return LogAdd(a, b); } -// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or -// best-path negated cost) Note: in either case, the alphas and betas are -// negated costs. Requires that lat be topologically sorted. This code -// will work for either CompactLattice or Latice. template -static double ComputeLatticeAlphasAndBetas(const LatticeType &lat, - bool viterbi, - vector *alpha, - vector *beta) { +double ComputeLatticeAlphasAndBetas(const LatticeType &lat, + bool viterbi, + vector *alpha, + vector *beta) { typedef typename LatticeType::Arc Arc; typedef typename Arc::Weight Weight; typedef typename Arc::StateId StateId; @@ -462,6 +458,19 @@ static double ComputeLatticeAlphasAndBetas(const LatticeType &lat, return 0.5 * (tot_backward_prob + tot_forward_prob); } +// instantiate the template for Lattice and CompactLattice +template +double ComputeLatticeAlphasAndBetas(const Lattice &lat, + bool viterbi, + vector *alpha, + vector *beta); + +template +double ComputeLatticeAlphasAndBetas(const CompactLattice &lat, + bool viterbi, + vector *alpha, + vector *beta); + /// This is used in CompactLatticeLimitDepth. diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h index 505aaffbe55..c58b2ec32b8 100644 --- a/src/lat/lattice-functions.h +++ b/src/lat/lattice-functions.h @@ -45,7 +45,7 @@ int32 LatticeStateTimes(const Lattice &lat, std::vector *times); /// As LatticeStateTimes, but in the CompactLattice format. Note: must /// be topologically sorted. Returns length of the utterance in frames, which -/// may not be the same as the maximum time in the lattice, due to frames +/// might not be the same as the maximum time in the lattice, due to frames /// in the final-prob. int32 CompactLatticeStateTimes(const CompactLattice &clat, std::vector *times); @@ -64,7 +64,7 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, double *acoustic_like_sum = NULL); // This function is something similar to LatticeForwardBackward(), but it is on -// the CompactLattice lattice format. Also we only need the alpha in the forward +// the CompactLattice lattice format. Also we only need the alpha in the forward // path, not the posteriors. bool ComputeCompactLatticeAlphas(const CompactLattice &lat, vector *alpha); @@ -74,6 +74,18 @@ bool ComputeCompactLatticeAlphas(const CompactLattice &lat, bool ComputeCompactLatticeBetas(const CompactLattice &lat, vector *beta); + +// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or +// best-path negated cost) Note: in either case, the alphas and betas are +// negated costs. Requires that lat be topologically sorted. This code +// will work for either CompactLattice or Latice. +template +double ComputeLatticeAlphasAndBetas(const LatticeType &lat, + bool viterbi, + vector *alpha, + vector *beta); + + /// Topologically sort the compact lattice if not already topologically sorted. /// Will crash if the lattice cannot be topologically sorted. 
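The lattice-functions change above (dropping "static", declaring ComputeLatticeAlphasAndBetas in the header, and adding the two "template double ComputeLatticeAlphasAndBetas..." lines) is the usual explicit-instantiation pattern: the template definition stays in the .cc file, and only the instantiations that callers actually need are emitted. A generic sketch of the pattern with made-up names (illustrative only, not part of the patch):

// In the header: declaration only.
template <typename LatticeType>
double TotalScore(const LatticeType &lat);

// In the .cc file: the definition, plus explicit instantiations so that code
// which only sees the header declaration still links against these versions.
struct ToyLattice        { int NumStates() const { return 3; } };
struct ToyCompactLattice { int NumStates() const { return 5; } };

template <typename LatticeType>
double TotalScore(const LatticeType &lat) {
  return static_cast<double>(lat.NumStates());
}

template double TotalScore<ToyLattice>(const ToyLattice &lat);
template double TotalScore<ToyCompactLattice>(const ToyCompactLattice &lat);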
void TopSortCompactLatticeIfNeeded(CompactLattice *clat); diff --git a/src/latbin/Makefile b/src/latbin/Makefile index f1633978fbf..74bf664b6c6 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -20,7 +20,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-minimize lattice-limit-depth lattice-depth-per-frame \ lattice-confidence lattice-determinize-phone-pruned \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ - lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons + lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ + lattice-arc-post lattice-determinize-non-compact OBJFILES = @@ -30,7 +31,7 @@ TESTFILES = ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../hmm/kaldi-hmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ - ../thread/kaldi-thread.a ../fstext/kaldi-fstext.a ../base/kaldi-base.a + ../thread/kaldi-thread.a ../fstext/kaldi-fstext.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/lattice-arc-post.cc b/src/latbin/lattice-arc-post.cc new file mode 100644 index 00000000000..38a5d6d304d --- /dev/null +++ b/src/latbin/lattice-arc-post.cc @@ -0,0 +1,214 @@ +// latbin/lattice-arc-post.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +namespace kaldi { + +// This class computes and outputs +// the information about arc posteriors. + +class ArcPosteriorComputer { + public: + // Note: 'clat' must be topologically sorted. + ArcPosteriorComputer(const CompactLattice &clat, + BaseFloat min_post, + bool print_alignment, + const TransitionModel *trans_model = NULL): + clat_(clat), min_post_(min_post), print_alignment_(print_alignment), + trans_model_(trans_model) { } + + // returns the number of arc posteriors that it output. 
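The quantity printed by OutputPosteriors below is the standard forward-backward arc posterior: log post(arc) = alpha[src] - cost(arc) + beta[dst] - log p(lattice), clamped at zero and exponentiated. A self-contained sketch of that computation on a generic arc list (illustrative only, not part of the patch; alpha and beta are negated costs, as in the code below):

#include <algorithm>
#include <cmath>
#include <vector>

struct SimpleArc { int src, dst; double cost; };  // cost = graph + acoustic

// alpha[s] / beta[s] are forward / backward log-likelihoods of state s;
// tot_like is beta[start], i.e. the total log-likelihood of the lattice.
std::vector<double> ArcPosteriors(const std::vector<SimpleArc> &arcs,
                                  const std::vector<double> &alpha,
                                  const std::vector<double> &beta,
                                  double tot_like) {
  std::vector<double> post(arcs.size());
  for (size_t i = 0; i < arcs.size(); ++i) {
    double log_post = alpha[arcs[i].src] - arcs[i].cost +
                      beta[arcs[i].dst] - tot_like;
    post[i] = std::exp(std::min(log_post, 0.0));  // clamp small positive drift
  }
  return post;
}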
+ int32 OutputPosteriors(const std::string &utterance, + std::ostream &os) { + int32 num_post = 0; + if (!ComputeCompactLatticeAlphas(clat_, &alpha_)) + return num_post; + if (!ComputeCompactLatticeBetas(clat_, &beta_)) + return num_post; + + CompactLatticeStateTimes(clat_, &state_times_); + if (clat_.Start() < 0) + return 0; + double tot_like = beta_[clat_.Start()]; + + int32 num_states = clat_.NumStates(); + for (int32 state = 0; state < num_states; state++) { + for (fst::ArcIterator aiter(clat_, state); + !aiter.Done(); aiter.Next()) { + const CompactLatticeArc &arc = aiter.Value(); + double arc_loglike = -ConvertToCost(arc.weight) + + alpha_[state] + beta_[arc.nextstate] - tot_like; + KALDI_ASSERT(arc_loglike < 0.1 && + "Bad arc posterior in forward-backward computation"); + if (arc_loglike > 0.0) arc_loglike = 0.0; + int32 num_frames = arc.weight.String().size(), + word = arc.ilabel; + BaseFloat arc_post = exp(arc_loglike); + if (arc_post <= min_post_) continue; + os << utterance << '\t' << state_times_[state] << '\t' << num_frames + << '\t' << arc_post << '\t' << word; + if (print_alignment_) { + os << '\t'; + const std::vector &ali = arc.weight.String(); + for (int32 frame = 0; frame < num_frames; frame++) { + os << ali[frame]; + if (frame + 1 < num_frames) os << ','; + } + } + if (trans_model_ != NULL) { + // we want to print the phone sequence too. + os << '\t'; + const std::vector &ali = arc.weight.String(); + bool first_phone = true; + for (int32 frame = 0; frame < num_frames; frame++) { + if (trans_model_->IsFinal(ali[frame])) { + if (first_phone) first_phone = false; + else os << ' '; + os << trans_model_->TransitionIdToPhone(ali[frame]); + } + } + } + os << std::endl; + num_post++; + } + } + return num_post; + } + private: + const CompactLattice &clat_; + std::vector alpha_; + std::vector beta_; + std::vector state_times_; + + BaseFloat min_post_; + bool print_alignment_; + const TransitionModel *trans_model_; +}; + +} + + +int main(int argc, char *argv[]) { + try { + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Print out information regarding posteriors of lattice arcs\n" + "This program computes posteriors from a lattice and prints out\n" + "information for each arc (the format is reminiscent of ctm, but\n" + "contains information from multiple paths). Each line is:\n" + " [] [ ...]\n" + "for instance:\n" + "2013a04-bk42\t104\t26\t0.95\t0\t11,242,242,242,71,894,894,62,63,63,63,63\t2 8 9\n" + "where the --print-alignment option determines whether the alignments (i.e. the\n" + "sequences of transition-ids) are printed, and the phones are printed only if the\n" + " is supplied on the command line. Note, there are tabs between the major\n" + "fields, but the phones are separated by spaces.\n" + "Usage: lattice-arc-post [] \n" + "e.g.: lattice-arc-post --acoustic-scale=0.1 final.mdl 'ark:gunzip -c lat.1.gz|' post.txt\n" + "You will probably want to word-align the lattices (e.g. 
lattice-align-words or\n" + "lattice-align-words-lexicon) before this program, apply an acoustic scale either\n" + "via the --acoustic-scale option or using lattice-scale.\n" + "See also: lattice-post, lattice-to-ctm-conf, nbest-to-ctm\n"; + + kaldi::BaseFloat acoustic_scale = 1.0, lm_scale = 1.0; + kaldi::BaseFloat min_post = 0.0001; + bool print_alignment = false; + + kaldi::ParseOptions po(usage); + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("lm-scale", &lm_scale, + "Scaling factor for \"graph costs\" (including LM costs)"); + po.Register("print-alignment", &print_alignment, + "If true, print alignments (i.e. sequences of transition-ids) for each\n" + "arc."); + po.Register("min-post", &min_post, + "Arc posteriors below this value will be pruned away"); + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + + kaldi::TransitionModel trans_model; + + std::string lats_rspecifier, output_wxfilename; + if (po.NumArgs() == 3) { + ReadKaldiObject(po.GetArg(1), &trans_model); + lats_rspecifier = po.GetArg(2); + output_wxfilename = po.GetArg(3); + } else { + lats_rspecifier = po.GetArg(1); + output_wxfilename = po.GetArg(2); + } + + + kaldi::Output output(output_wxfilename, false); + + // Read as regular lattice + kaldi::SequentialCompactLatticeReader clat_reader(lats_rspecifier); + + int64 tot_post = 0; + int32 num_lat_done = 0, num_lat_err = 0; + + for (; !clat_reader.Done(); clat_reader.Next()) { + std::string key = clat_reader.Key(); + kaldi::CompactLattice clat = clat_reader.Value(); + // FreeCurrent() is an optimization that prevents the lattice from being + // copied unnecessarily (OpenFst does copy-on-write). + clat_reader.FreeCurrent(); + fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &clat); + kaldi::TopSortCompactLatticeIfNeeded(&clat); + + kaldi::ArcPosteriorComputer computer( + clat, min_post, print_alignment, + (po.NumArgs() == 3 ? &trans_model : NULL)); + + int32 num_post = computer.OutputPosteriors(key, output.Stream()); + if (num_post != 0) { + num_lat_done++; + tot_post += num_post; + } else { + num_lat_err++; + KALDI_WARN << "No posterior printed for " << key; + } + } + KALDI_LOG << "Printed posteriors for " << num_lat_done << " lattices (" + << num_lat_err << " with errors); on average printed " + << (tot_post / (num_lat_done == 0 ? 1 : num_lat_done)) + << " posteriors per lattice."; + return (num_lat_done > 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/latbin/lattice-best-path.cc b/src/latbin/lattice-best-path.cc index dda41cd0604..dc25fb351c6 100644 --- a/src/latbin/lattice-best-path.cc +++ b/src/latbin/lattice-best-path.cc @@ -121,7 +121,7 @@ int main(int argc, char *argv[]) { } BaseFloat tot_weight_float = tot_weight.Value1() + tot_weight.Value2(); - KALDI_LOG << "Overall score per frame is " << (tot_weight_float/n_frame) + KALDI_LOG << "Overall cost per frame is " << (tot_weight_float/n_frame) << " = " << (tot_weight.Value1()/n_frame) << " [graph]" << " + " << (tot_weight.Value2()/n_frame) << " [acoustic]" << " over " << n_frame << " frames."; diff --git a/src/latbin/lattice-copy.cc b/src/latbin/lattice-copy.cc index 76ca034b2e4..f66eb699705 100644 --- a/src/latbin/lattice-copy.cc +++ b/src/latbin/lattice-copy.cc @@ -24,6 +24,108 @@ #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +namespace kaldi { + int32 CopySubsetLattices(std::string filename, + SequentialLatticeReader *lattice_reader, + LatticeWriter *lattice_writer, + bool include = true, bool ignore_missing = false + ) { + unordered_set subset; + std::set subset_list; + + bool binary; + Input ki(filename, &binary); + KALDI_ASSERT(!binary); + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + SplitStringToVector(line, " \t\r", true, &split_line); + if(split_line.empty()) { + KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename; + } + subset.insert(split_line[0]); + subset_list.insert(split_line[0]); + } + + int32 num_total = 0; + size_t num_success = 0; + for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { + if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + KALDI_LOG << "The utterance " << lattice_reader->Key() + << " is larger than " + << "the last key in the include list. Not reading further."; + KALDI_LOG << "Wrote " << num_success << " utterances"; + return 0; + } + + if (include && subset.count(lattice_reader->Key()) > 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } else if (!include && subset.count(lattice_reader->Key()) == 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } + } + + KALDI_LOG << "Wrote " << num_success << " out of " << num_total + << " utterances."; + + if (ignore_missing) return 0; + + return (num_success != 0 ? 0 : 1); + } + + int32 CopySubsetLattices(std::string filename, + SequentialCompactLatticeReader *lattice_reader, + CompactLatticeWriter *lattice_writer, + bool include = true, bool ignore_missing = false + ) { + unordered_set subset; + std::set subset_list; + + bool binary; + Input ki(filename, &binary); + KALDI_ASSERT(!binary); + std::string line; + while (std::getline(ki.Stream(), line)) { + std::vector split_line; + SplitStringToVector(line, " \t\r", true, &split_line); + if(split_line.empty()) { + KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename; + } + subset.insert(split_line[0]); + subset_list.insert(split_line[0]); + } + + int32 num_total = 0; + size_t num_success = 0; + for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { + if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + KALDI_LOG << "The utterance " << lattice_reader->Key() + << " is larger than " + << "the last key in the include list. 
Not reading further."; + KALDI_LOG << "Wrote " << num_success << " utterances"; + return 0; + } + + if (include && subset.count(lattice_reader->Key()) > 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } else if (!include && subset.count(lattice_reader->Key()) == 0) { + lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value()); + num_success++; + } + } + + KALDI_LOG << "Wrote " << num_success << " out of " << num_total + << " utterances."; + + if (ignore_missing) return 0; + + return (num_success != 0 ? 0 : 1); + } +} + int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -36,14 +138,32 @@ int main(int argc, char *argv[]) { const char *usage = "Copy lattices (e.g. useful for changing to text mode or changing\n" "format to standard from compact lattice.)\n" + "The --include and --exclude options can be used to copy only a subset " + "of lattices, where the --include option specifies the " + "whitelisted utterances that would be copied and the --exclude option " + "specifies the blacklisted utterances that would not be copied.\n" + "Only one of --include and --exclude can be supplied.\n" "Usage: lattice-copy [options] lattice-rspecifier lattice-wspecifier\n" " e.g.: lattice-copy --write-compact=false ark:1.lats ark,t:text.lats\n" "See also: lattice-to-fst, and the script egs/wsj/s5/utils/convert_slf.pl\n"; ParseOptions po(usage); - bool write_compact = true; + bool write_compact = true, ignore_missing = false; + std::string include_rxfilename; + std::string exclude_rxfilename; + po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - + po.Register("include", &include_rxfilename, + "Text file, the first field of each " + "line being interpreted as the " + "utterance-id whose lattices will be included"); + po.Register("exclude", &exclude_rxfilename, + "Text file, the first field of each " + "line being interpreted as an utterance-id " + "whose lattices will be excluded"); + po.Register("ignore-missing", &ignore_missing, + "Exit with status 0 even if no lattices are copied"); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -59,15 +179,46 @@ int main(int argc, char *argv[]) { if (write_compact) { SequentialCompactLatticeReader lattice_reader(lats_rspecifier); CompactLatticeWriter lattice_writer(lats_wspecifier); + + if (include_rxfilename != "") { + if (exclude_rxfilename != "") { + KALDI_ERR << "should not have both --exclude and --include option!"; + } + return CopySubsetLattices(include_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } else if (exclude_rxfilename != "") { + return CopySubsetLattices(exclude_rxfilename, + &lattice_reader, &lattice_writer, + false, ignore_missing); + } + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value()); } else { SequentialLatticeReader lattice_reader(lats_rspecifier); LatticeWriter lattice_writer(lats_wspecifier); + + if (include_rxfilename != "") { + if (exclude_rxfilename != "") { + KALDI_ERR << "should not have both --exclude and --include option!"; + } + return CopySubsetLattices(include_rxfilename, + &lattice_reader, &lattice_writer, + true, ignore_missing); + } else if (exclude_rxfilename != "") { + return CopySubsetLattices(exclude_rxfilename, + &lattice_reader, &lattice_writer, + false, ignore_missing); + } + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) lattice_writer.Write(lattice_reader.Key(),
lattice_reader.Value()); } KALDI_LOG << "Done copying " << n_done << " lattices."; + + if (ignore_missing) return 0; + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc new file mode 100644 index 00000000000..8665fcb58d1 --- /dev/null +++ b/src/latbin/lattice-determinize-non-compact.cc @@ -0,0 +1,317 @@ +// latbin/lattice-determinize-non-compact.cc + +// Copyright 2009-2012 Microsoft Corporation +// 2012-2013 Johns Hopkins University (Author: Daniel Povey) +// 2015 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "util/stl-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" +#include "lat/push-lattice.h" +#include "lat/minimize-lattice.h" + +namespace kaldi { + +typedef Lattice::StateId StateId; +typedef Lattice::Arc Arc; + +// This function is a copy of the function in the program lattice-determinize +bool DeterminizeLatticeWrapper(const Lattice &lat, + const std::string &key, + bool prune, + BaseFloat beam, + BaseFloat beam_ratio, + int32 max_mem, + int32 max_loop, + BaseFloat delta, + int32 num_loops, + CompactLattice *clat) { + fst::DeterminizeLatticeOptions lat_opts; + lat_opts.max_mem = max_mem; + lat_opts.max_loop = max_loop; + lat_opts.delta = delta; + BaseFloat cur_beam = beam; + for (int32 i = 0; i < num_loops;) { // we increment i below. + + if (lat.Start() == fst::kNoStateId) { + KALDI_WARN << "Detected empty lattice, skipping " << key; + return false; + } + + // The work gets done in the next line. + if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { + if (prune) PruneLattice(cur_beam, clat); + return true; + } else { // failed to determinize.. 
+ KALDI_WARN << "Failed to determinize lattice (presumably max-states " + << "reached), reducing lattice-beam to " + << (cur_beam*beam_ratio) << " and re-trying."; + for (; i < num_loops; i++) { + cur_beam *= beam_ratio; + Lattice pruned_lat(lat); + PruneLattice(cur_beam, &pruned_lat); + if (NumArcs(lat) == NumArcs(pruned_lat)) { + cur_beam *= beam_ratio; + KALDI_WARN << "Pruning did not have an effect on the original " + << "lattice size; reducing beam to " + << cur_beam << " and re-trying."; + } else if (DeterminizeLattice(pruned_lat, clat, lat_opts, NULL)) { + if (prune) PruneLattice(cur_beam, clat); + return true; + } else { + KALDI_WARN << "Determinization failed again; reducing beam again to " + << (cur_beam*beam_ratio) << " and re-trying."; + } + } + } + } + KALDI_WARN << "Decreased pruning beam --num-loops=" << num_loops + << " times and was not able to determinize: failed for " + << key; + return false; +} + +void ComputeAcousticScoresMap( + const Lattice &lat, + unordered_map, std::pair, + PairHasher > *acoustic_scores) { + acoustic_scores->clear(); + + std::vector state_times; + LatticeStateTimes(lat, &state_times); + + KALDI_ASSERT(lat.Start() == 0); + + for (StateId s = 0; s < lat.NumStates(); s++) { + int32 t = state_times[s]; + for (fst::ArcIterator aiter(lat, s); !aiter.Done(); + aiter.Next()) { + const Arc &arc = aiter.Value(); + const LatticeWeight &weight = arc.weight; + + int32 tid = arc.ilabel; + + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::iterator it = acoustic_scores->find(std::make_pair(t, tid)); + if (it == acoustic_scores->end()) { + acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), + std::make_pair(weight.Value2(), 1))); + } else { + if (it->second.second == 2 + && it->second.first / it->second.second != weight.Value2()) { + KALDI_VLOG(2) << "Transitions on the same frame have different " + << "acoustic costs for tid " << tid << "; " + << it->second.first / it->second.second + << " vs " << weight.Value2(); + } + it->second.first += weight.Value2(); + it->second.second++; + } + } else { + // Arcs with epsilon input label (tid) must have 0 acoustic cost + KALDI_ASSERT(weight.Value2() == 0); + } + } + + LatticeWeight f = lat.Final(s); + if (f != LatticeWeight::Zero()) { + // Final acoustic cost must be 0 as we are reading from + // non-determinized, non-compact lattice + KALDI_ASSERT(f.Value2() == 0.0); + } + } +} + +void ReplaceAcousticScoresFromMap( + const unordered_map, std::pair, + PairHasher > &acoustic_scores, + Lattice *lat) { + fst::TopSort(lat); + + std::vector state_times; + LatticeStateTimes(*lat, &state_times); + + KALDI_ASSERT(lat->Start() == 0); + + for (StateId s = 0; s < lat->NumStates(); s++) { + int32 t = state_times[s]; + for (fst::MutableArcIterator aiter(lat, s); + !aiter.Done(); aiter.Next()) { + Arc arc(aiter.Value()); + + int32 tid = arc.ilabel; + if (tid != 0) { + unordered_map, std::pair, + PairHasher >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid)); + if (it == acoustic_scores.end()) { + KALDI_ERR << "Could not find tid " << tid << " at time " << t + << " in the acoustic scores map."; + } else { + arc.weight.SetValue2(it->second.first / it->second.second); + } + } else { + // For epsilon arcs, set acoustic cost to 0.0 + arc.weight.SetValue2(0.0); + } + aiter.SetValue(arc); + } + + LatticeWeight f = lat->Final(s); + if (f != LatticeWeight::Zero()) { + // Set final acoustic cost to 0.0 + f.SetValue2(0.0); + lat->SetFinal(s, f); + } + } +} + +} + +int main(int argc, char *argv[]) { + try { + 
using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "lattice-determinize lattices (and apply a pruning beam)\n" + " (see http://kaldi.sourceforge.net/lattices.html for more explanation)\n" + "This version of the program retains the original " + "acoustic scores of arcs in the determinized lattice and writes it " + "as a normal (non-compact) lattice. \n" + " note: this program is tyically only useful if you generated state-level\n" + " lattices, e.g. called gmm-latgen-simple with --determinize=false\n" + "\n" + "Usage: lattice-determinize-non-compact [options] lattice-rspecifier lattice-wspecifier\n" + " e.g.: lattice-determinize-non-compact --acoustic-scale=0.1 --beam=15.0 ark:1.lats ark:det.lats\n"; + + ParseOptions po(usage); + BaseFloat acoustic_scale = 1.0; + BaseFloat beam = 10.0; + BaseFloat beam_ratio = 0.9; + int32 num_loops = 20; + int32 max_mem = 50000000; // 50 MB + int32 max_loop = 500000; + BaseFloat delta = fst::kDelta; + bool prune = false; + bool minimize = false; + + po.Register("acoustic-scale", &acoustic_scale, + "Scaling factor for acoustic likelihoods"); + po.Register("beam", &beam, + "Pruning beam [applied after acoustic scaling]-- also used " + "to handle determinization failures, set --prune=false to " + "disable routine pruning"); + po.Register("delta", &delta, "Tolerance used in determinization"); + po.Register("prune", &prune, "If true, prune determinized lattices " + "with the --beam option."); + po.Register("max-mem", &max_mem, "Maximum approximate memory usage in " + "determinization (real usage might be many times this)"); + po.Register("max-loop", &max_loop, "Option to detect a certain " + "type of failure in lattice determinization (not critical)"); + po.Register("beam-ratio", &beam_ratio, "Ratio by which to " + "decrease beam if we reach the max-arcs."); + po.Register("num-loops", &num_loops, "Number of times to " + "decrease beam by beam-ratio if determinization fails."); + po.Register("minimize", &minimize, + "If true, push and minimize after determinization"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + lats_wspecifier = po.GetArg(2); + + // Read as regular lattice-- this is the form we need it in for efficient + // pruning. + SequentialLatticeReader lattice_reader(lats_rspecifier); + + // Write as regular lattice. + LatticeWriter lattice_writer(lats_wspecifier); + + int32 n_done = 0, n_error = 0; + + if (acoustic_scale == 0.0) + KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)"; + LatticeWeight beam_weight(beam, static_cast(0.0)); + + for (; !lattice_reader.Done(); lattice_reader.Next()) { + std::string key = lattice_reader.Key(); + Lattice lat = lattice_reader.Value(); + + lattice_reader.FreeCurrent(); + + fst::TopSort(&lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat); + + + // Compute a map from each (t, tid) to (sum_of_acoustic_scores, count) + unordered_map, std::pair, + PairHasher > acoustic_scores; + ComputeAcousticScoresMap(lat, &acoustic_scores); + + Invert(&lat); // make it so word labels are on the input. 
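The ComputeAcousticScoresMap / ReplaceAcousticScoresFromMap pair used above reduces to per-key accumulate-then-average bookkeeping on (frame, transition-id) keys. A minimal sketch with standard containers (illustrative only, not part of the patch; the patch itself keys an unordered_map with Kaldi's PairHasher):

#include <map>
#include <utility>

typedef std::map<std::pair<int, int>, std::pair<double, int> > ScoreMap;

// Accumulate one acoustic cost observed for (frame t, transition-id tid).
void AccumulateScore(int t, int tid, double acoustic_cost, ScoreMap *stats) {
  std::pair<double, int> &entry = (*stats)[std::make_pair(t, tid)];
  entry.first += acoustic_cost;  // running sum of costs
  entry.second += 1;             // number of arcs seen for this (t, tid)
}

// Averaged cost to write back onto the determinized lattice's arcs.
double AveragedScore(const ScoreMap &stats, int t, int tid) {
  ScoreMap::const_iterator it = stats.find(std::make_pair(t, tid));
  return it->second.first / it->second.second;  // caller ensures the key exists
}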
+ + CompactLattice clat; + if (DeterminizeLatticeWrapper(lat, key, prune, + beam, beam_ratio, max_mem, max_loop, + delta, num_loops, &clat)) { + if (minimize) { + PushCompactLatticeStrings(&clat); + PushCompactLatticeWeights(&clat); + MinimizeCompactLattice(&clat); + } + + Lattice out_lat; + fst::ConvertLattice(clat, &out_lat); + fst::TopSort(&out_lat); + + // Replace each arc (t, tid) with the averaged acoustic score from + // the computed map + ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat); + + fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), + &out_lat); + lattice_writer.Write(key, out_lat); + n_done++; + } else { + n_error++; // will have already printed warning. + } + } + + KALDI_LOG << "Done " << n_done << " lattices, errors on " << n_error; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/latbin/lattice-to-ctm-conf.cc b/src/latbin/lattice-to-ctm-conf.cc index 5489e560be8..56ea983ac9b 100644 --- a/src/latbin/lattice-to-ctm-conf.cc +++ b/src/latbin/lattice-to-ctm-conf.cc @@ -51,8 +51,8 @@ int main(int argc, char *argv[]) { " e.g.: lattice-to-ctm-conf --acoustic-scale=0.1 ark:1.lats 1.ctm\n" " or: lattice-to-ctm-conf --acoustic-scale=0.1 --decode-mbr=false\\\n" " ark:1.lats ark:1.1best 1.ctm\n" - "See also: lattice-mbr-decode, nbest-to-ctm, steps/get_ctm.sh,\n" - " steps/get_train_ctm.sh and utils/convert_ctm.sh.\n"; + "See also: lattice-mbr-decode, nbest-to-ctm, lattice-arc-post,\n" + " steps/get_ctm.sh, steps/get_train_ctm.sh and utils/convert_ctm.sh.\n"; ParseOptions po(usage); BaseFloat acoustic_scale = 1.0, inv_acoustic_scale = 1.0, lm_scale = 1.0; @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) { po.Register("decode-mbr", &decode_mbr, "If true, do Minimum Bayes Risk " "decoding (else, Maximum a Posteriori)"); po.Register("frame-shift", &frame_shift, "Time in seconds between frames."); - + po.Read(argc, argv); if (po.NumArgs() != 2 && po.NumArgs() != 3) { @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(acoustic_scale == 1.0 || inv_acoustic_scale == 1.0); if (inv_acoustic_scale != 1.0) acoustic_scale = 1.0 / inv_acoustic_scale; - + std::string lats_rspecifier, one_best_rspecifier, ctm_wxfilename; if (po.NumArgs() == 2) { @@ -92,9 +92,9 @@ int main(int argc, char *argv[]) { one_best_rspecifier = po.GetArg(2); ctm_wxfilename = po.GetArg(3); } - + // Ensure the output ctm file is not a wspecifier - WspecifierType ctm_wx_type; + WspecifierType ctm_wx_type; ctm_wx_type = ClassifyWspecifier(ctm_wxfilename, NULL, NULL, NULL); if(ctm_wx_type != kNoWspecifier){ KALDI_ERR << "The output ctm file should not be a wspecifier. " @@ -104,7 +104,7 @@ int main(int argc, char *argv[]) { // Read as compact lattice. SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + RandomAccessInt32VectorReader one_best_reader(one_best_rspecifier); Output ko(ctm_wxfilename, false); // false == non-binary writing mode. 
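For orientation, the CTM lines written by lattice-to-ctm-conf here (and by nbest-to-ctm further down) have the form "<utterance-id> <channel> <start-seconds> <duration-seconds> <word-id> [<confidence>]", with frame indices converted to seconds via the frame shift. A minimal sketch of writing one such line (illustrative only; the helper name and default values are made up):

#include <iomanip>
#include <ostream>
#include <string>

// Write one CTM entry; start_frame / num_frames are frame counts that get
// converted to seconds using frame_shift (0.01 s by default in these tools).
void WriteCtmLine(std::ostream &os, const std::string &utt,
                  int start_frame, int num_frames, int word_id,
                  double confidence, double frame_shift = 0.01) {
  os << std::fixed << std::setprecision(2)
     << utt << " 1 " << (frame_shift * start_frame) << ' '
     << (frame_shift * num_frames) << ' ' << word_id << ' '
     << confidence << '\n';
}

This is also why the nbest-to-ctm hunk below bumps the output precision when --frame-shift drops under 0.01: two decimal places cannot represent the smaller shifts.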
@@ -114,7 +114,7 @@ int main(int argc, char *argv[]) { int32 n_done = 0, n_words = 0; BaseFloat tot_bayes_risk = 0.0; - + for (; !clat_reader.Done(); clat_reader.Next()) { std::string key = clat_reader.Key(); CompactLattice clat = clat_reader.Value(); @@ -133,7 +133,7 @@ int main(int argc, char *argv[]) { const std::vector &one_best = one_best_reader.Value(key); mbr = new MinimumBayesRisk(clat, one_best, decode_mbr); } - + const std::vector &conf = mbr->GetOneBestConfidences(); const std::vector &words = mbr->GetOneBest(); const std::vector > × = @@ -146,7 +146,7 @@ int main(int argc, char *argv[]) { << words[i] << ' ' << conf[i] << '\n'; } KALDI_LOG << "For utterance " << key << ", Bayes Risk " - << mbr->GetBayesRisk() << ", avg. confidence per-word " + << mbr->GetBayesRisk() << ", avg. confidence per-word " << std::accumulate(conf.begin(),conf.end(),0.0) / words.size(); n_done++; n_words += mbr->GetOneBest().size(); @@ -158,7 +158,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall average Bayes Risk per sentence is " << (tot_bayes_risk / n_done) << " and per word, " << (tot_bayes_risk / n_words); - + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/latbin/lattice-to-post.cc b/src/latbin/lattice-to-post.cc index 559fa480920..c04a6748a52 100644 --- a/src/latbin/lattice-to-post.cc +++ b/src/latbin/lattice-to-post.cc @@ -35,7 +35,7 @@ int main(int argc, char *argv[]) { "Do forward-backward and collect posteriors over lattices.\n" "Usage: lattice-to-post [options] lats-rspecifier posts-wspecifier [loglikes-wspecifier]\n" " e.g.: lattice-to-post --acoustic-scale=0.1 ark:1.lats ark:1.post\n" - "See also: lattice-to-ctm-conf, post-to-pdf-post\n"; + "See also: lattice-to-ctm-conf, post-to-pdf-post, lattice-arc-post\n"; kaldi::BaseFloat acoustic_scale = 1.0, lm_scale = 1.0; kaldi::ParseOptions po(usage); @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { lattice_reader.FreeCurrent(); if (acoustic_scale != 1.0 || lm_scale != 1.0) fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &lat); - + kaldi::uint64 props = lat.Properties(fst::kFstProperties, false); if (!(props & fst::kTopSorted)) { if (fst::TopSort(&lat) == false) @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { << " arcs. Average log-likelihood = " << (lat_like/lat_time) << " over " << lat_time << " frames. 
Average acoustic log-like" << " per frame is " << (lat_ac_like/lat_time); - - if (loglikes_writer.IsOpen()) + + if (loglikes_writer.IsOpen()) loglikes_writer.Write(key, lat_like); posterior_writer.Write(key, post); diff --git a/src/latbin/nbest-to-ctm.cc b/src/latbin/nbest-to-ctm.cc index 1993041dee6..e396f315ba1 100644 --- a/src/latbin/nbest-to-ctm.cc +++ b/src/latbin/nbest-to-ctm.cc @@ -1,6 +1,6 @@ // latbin/nbest-to-ctm.cc -// Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +// Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -43,14 +43,19 @@ int main(int argc, char *argv[]) { "e.g.: lattice-1best --acoustic-weight=0.08333 ark:1.lats | \\\n" " lattice-align-words data/lang/phones/word_boundary.int exp/dir/final.mdl ark:- ark:- | \\\n" " nbest-to-ctm ark:- 1.ctm\n"; - + ParseOptions po(usage); + bool print_silence = false; BaseFloat frame_shift = 0.01; int32 precision = 2; + po.Register("print-silence", &print_silence, "If true, print optional-silence " + "() arcs"); po.Register("frame-shift", &frame_shift, "Time in seconds between frames.\n"); po.Register("precision", &precision, - "Number of decimal places for start duration times\n"); + "Number of decimal places for start duration times (note: we " + "may use a higher value than this if it's obvious from " + "--frame-shift that this value is too small"); po.Read(argc, argv); @@ -62,15 +67,21 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), ctm_wxfilename = po.GetArg(2); + if (frame_shift < 0.01 && precision <= 2) + precision = 3; + if (frame_shift < 0.001 && precision <= 3) + precision = 4; + + SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + int32 n_done = 0, n_err = 0; Output ko(ctm_wxfilename, false); // false == non-binary write mode. ko.Stream() << std::fixed; // Set to "fixed" floating point model, where precision() specifies // the #digits after the decimal point. ko.Stream().precision(precision); - + for (; !clat_reader.Done(); clat_reader.Next()) { std::string key = clat_reader.Key(); CompactLattice clat = clat_reader.Value(); @@ -84,7 +95,7 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(words.size() == times.size() && words.size() == lengths.size()); for (size_t i = 0; i < words.size(); i++) { - if (words[i] == 0) // Don't output anything for links, which + if (words[i] == 0 && !print_silence) // Don't output anything for links, which continue; // correspond to silence.... ko.Stream() << key << " 1 " << (frame_shift * times[i]) << ' ' << (frame_shift * lengths[i]) << ' ' << words[i] < +#include +#include +#include +#include +#include "lm/kaldi-lm.h" + +#include "lm/arpa-file-parser.h" + +namespace kaldi { +namespace { + +const int kMaxOrder = 3; + +struct NGramTestData { + int32 line_number; + float logprob; + int32 words[kMaxOrder]; + float backoff; +}; + +std::ostream& operator<<(std::ostream& os, const NGramTestData& data) { + std::ios::fmtflags saved_state(os.flags()); + os << std::fixed << std::setprecision(6); + + os << data.logprob << ' '; + for (int i = 0; i < kMaxOrder; ++i) os << data.words[i] << ' '; + os << data.backoff << " // Line " << data.line_number; + + os.flags(saved_state); + return os; +} + +// This does not own the array pointer, and uset to simplify passing expected +// result to TestableArpaFileParser::Verify. 
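The CountedArray / MakeCountedArray helpers defined next rely on deducing a C array's length through a reference-to-array template parameter, so the expected-result tables can be passed around without an explicit element count. The idiom in isolation (illustrative only, not part of the patch):

#include <cstddef>

// N is deduced from the array's static size at the call site, so the caller
// never has to pass the element count explicitly.
template <typename T, std::size_t N>
std::size_t ArrayLength(T (&)[N]) { return N; }

// e.g.:  int counts[] = { 4, 2, 2 };  ArrayLength(counts) == 3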
+template +struct CountedArray { + template + CountedArray(T(&array)[N]) : array(array), count(N) { } + const T* array; + const size_t count; +}; + +template +inline CountedArray MakeCountedArray(T(&array)[N]) { + return CountedArray(array); +} + +class TestableArpaFileParser : public ArpaFileParser { + public: + TestableArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : ArpaFileParser(options, symbols), + header_available_(false), + read_complete_(false), + last_order_(0) { } + void Validate(CountedArray counts, CountedArray ngrams); + + private: + // ArpaFileParser overrides. + virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + + bool header_available_; + bool read_complete_; + int32 last_order_; + std::vector ngrams_; +}; + +void TestableArpaFileParser::HeaderAvailable() { + KALDI_ASSERT(!header_available_); + KALDI_ASSERT(!read_complete_); + header_available_ = true; + KALDI_ASSERT(NgramCounts().size() <= kMaxOrder); +} + +void TestableArpaFileParser::ConsumeNGram(const NGram& ngram) { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + KALDI_ASSERT(ngram.words.size() <= NgramCounts().size()); + KALDI_ASSERT(ngram.words.size() >= last_order_); + last_order_ = ngram.words.size(); + + NGramTestData entry = { 0 }; + entry.line_number = LineNumber(); + entry.logprob = ngram.logprob; + entry.backoff = ngram.backoff; + std::copy(ngram.words.begin(), ngram.words.end(), entry.words); + ngrams_.push_back(entry); +} + +void TestableArpaFileParser::ReadComplete() { + KALDI_ASSERT(header_available_); + KALDI_ASSERT(!read_complete_); + read_complete_ = true; +} + +// +bool CompareNgrams(const NGramTestData& actual, + const NGramTestData& expected) { + if (actual.line_number != expected.line_number + || !std::equal(actual.words, actual.words + kMaxOrder, + expected.words) + || !ApproxEqual(actual.logprob, expected.logprob) + || !ApproxEqual(actual.backoff, expected.backoff)) { + KALDI_WARN << "Actual n-gram [" << actual + << "] differs from expected [" << expected << "]"; + return false; + } + return true; +} + +void TestableArpaFileParser::Validate( + CountedArray expect_counts, + CountedArray expect_ngrams) { + // This needs better disagnostics probably. + KALDI_ASSERT(NgramCounts().size() == expect_counts.count); + KALDI_ASSERT(std::equal(NgramCounts().begin(), NgramCounts().end(), + expect_counts.array)); + + KALDI_ASSERT(ngrams_.size() == expect_ngrams.count); + // auto mpos = std::mismatch(ngrams_.begin(), ngrams_.end(), + // expect_ngrams.array, CompareNgrams); + // if (mpos.first != ngrams_.end()) + // KALDI_ERR << "Maismatch at index " << mpos.first - ngrams_.begin(); + //TODO:auto above requres C++11, and I cannot spell out the type!!! + KALDI_ASSERT(std::equal(ngrams_.begin(), ngrams_.end(), + expect_ngrams.array, CompareNgrams)); +} + +// Read integer LM (no symbols) with log base conversion. 
+void ReadIntegerLmLogconvExpectSuccess() { + KALDI_LOG << "ReadIntegerLmLogconvExpectSuccess()"; + + static std::string integer_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.234679 4 -3.3\n\ +-3.456783 5\n\ +0.0000000 1 -2.5\n\ +-4.333333 2\n\ +\n\ +\\2-grams:\n\ +-1.45678 4 5 -3.23\n\ +-1.30490 1 4 -4.2\n\ +\n\ +\\3-grams:\n\ +-0.34958 1 4 5\n\ +-0.23940 4 5 2\n\ +\n\ +\\end\\"; + + int32 expect_counts[] = { 4, 2, 2 }; + NGramTestData expect_ngrams[] = { + { 7, -12.05329, { 4, 0, 0 }, -7.598531 }, + { 8, -7.959537, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -5.756463 }, + { 10, -9.977868, { 2, 0, 0 }, 0.0 }, + + { 13, -3.354360, { 4, 5, 0 }, -7.437350 }, + { 14, -3.004643, { 1, 4, 0 }, -9.670857 }, + + { 17, -0.804938, { 1, 4, 5 }, 0.0 }, + { 18, -0.551239, { 4, 5, 2 }, 0.0 } }; + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + + TestableArpaFileParser parser(options, NULL); + std::istringstream stm(integer_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_ngrams)); +} + +// \xCE\xB2 = UTF-8 for Greek beta, to churn some UTF-8 cranks. +static std::string symbolic_lm = "\ +\\data\\\n\ +ngram 1=4\n\ +ngram 2=2\n\ +ngram 3=2\n\ +\n\ +\\1-grams:\n\ +-5.2 a -3.3\n\ +-3.4 \xCE\xB2\n\ +0.0 -2.5\n\ +-4.3 \n\ +\n\ +\\2-grams:\n\ +-1.5 a \xCE\xB2 -3.2\n\ +-1.3 a -4.2\n\ +\n\ +\\3-grams:\n\ +-0.3 a \xCE\xB2\n\ +-0.2 a \n\ +\n\ +\\end\\"; + +// Symbol table that is created with predefined test symbols, "a" but no "b". +class TestSymbolTable : public fst::SymbolTable { + public: + TestSymbolTable() { + AddSymbol("", 0); + AddSymbol("", 1); + AddSymbol("", 2); + AddSymbol("", 3); + AddSymbol("a", 4); + } +}; + +// Full expected result shared between ReadSymbolicLmNoOovImpl and +// ReadSymbolicLmWithOovAddToSymbols(). +NGramTestData expect_symbolic_full[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 5, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 5, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 5 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + +// This is run with all possible oov setting and yields same result. +void ReadSymbolicLmNoOovImpl(ArpaParseOptions::OovHandling oov) { + int32 expect_counts[] = { 4, 2, 2 }; + TestSymbolTable symbols; + symbols.AddSymbol("\xCE\xB2", 5); + + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, &symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), + MakeCountedArray(expect_symbolic_full)); + KALDI_ASSERT(symbols.NumSymbols() == 6); +} + +void ReadSymbolicLmNoOovTests() { + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kRaiseError)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kRaiseError); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kAddToSymbols)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kAddToSymbols); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kReplaceWithUnk)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kReplaceWithUnk); + KALDI_LOG << "ReadSymbolicLmNoOovImpl(kSkipNGram)"; + ReadSymbolicLmNoOovImpl(ArpaParseOptions::kSkipNGram); +} + +// This is run with all possible oov setting and yields same result. 
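The integer-LM test above feeds ARPA scores in log base 10 and, because use_log10 is left at its default of false, expects them back converted to natural logarithms. A tiny sketch of that conversion (not Kaldi code) reproduces the first expected unigram score:

#include <cmath>
#include <cstdio>

int main() {
  // The parser rescales ARPA log10 scores to natural logs: score_e = score_10 * ln(10).
  double log10_score = -5.234679;                 // first unigram in the test LM
  double natural = log10_score * std::log(10.0);  // ln(10) is roughly 2.302585
  std::printf("%.5f\n", natural);                 // prints -12.05329, as in expect_ngrams
  return 0;
}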
+void ReadSymbolicLmWithOovImpl( + ArpaParseOptions::OovHandling oov, + CountedArray expect_ngrams, + fst::SymbolTable* symbols) { + int32 expect_counts[] = { 4, 2, 2 }; + ArpaParseOptions options; + options.bos_symbol = 1; + options.eos_symbol = 2; + options.unk_symbol = 3; + options.use_log10 = true; + options.oov_handling = oov; + TestableArpaFileParser parser(options, symbols); + std::istringstream stm(symbolic_lm, std::ios_base::in); + parser.Read(stm, false); + parser.Validate(MakeCountedArray(expect_counts), expect_ngrams); +} + +void ReadSymbolicLmWithOovAddToSymbols() { + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kAddToSymbols, + MakeCountedArray(expect_symbolic_full), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 6); + KALDI_ASSERT(symbols.Find("\xCE\xB2") == 5); +} + +void ReadSymbolicLmWithOovReplaceWithUnk() { + NGramTestData expect_symbolic_unk_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 8, -3.4, { 3, 0, 0 }, 0.0 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 13, -1.5, { 4, 3, 0 }, -3.2 }, + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 17, -0.3, { 1, 4, 3 }, 0.0 }, + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kReplaceWithUnk, + MakeCountedArray(expect_symbolic_unk_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovSkipNGram() { + NGramTestData expect_symbolic_no_b[] = { + { 7, -5.2, { 4, 0, 0 }, -3.3 }, + { 9, 0.0, { 1, 0, 0 }, -2.5 }, + { 10, -4.3, { 2, 0, 0 }, 0.0 }, + + { 14, -1.3, { 1, 4, 0 }, -4.2 }, + + { 18, -0.2, { 1, 4, 2 }, 0.0 } }; + + TestSymbolTable symbols; + ReadSymbolicLmWithOovImpl(ArpaParseOptions::kSkipNGram, + MakeCountedArray(expect_symbolic_no_b), + &symbols); + KALDI_ASSERT(symbols.NumSymbols() == 5); +} + +void ReadSymbolicLmWithOovTests() { + KALDI_LOG << "ReadSymbolicLmWithOovAddToSymbols()"; + ReadSymbolicLmWithOovAddToSymbols(); + KALDI_LOG << "ReadSymbolicLmWithOovReplaceWithUnk()"; + ReadSymbolicLmWithOovReplaceWithUnk(); + KALDI_LOG << "ReadSymbolicLmWithOovSkipNGram()"; + ReadSymbolicLmWithOovSkipNGram(); +} + +} // namespace +} // namespace kaldi + +int main(int argc, char *argv[]) { + kaldi::ReadIntegerLmLogconvExpectSuccess(); + kaldi::ReadSymbolicLmNoOovTests(); + kaldi::ReadSymbolicLmWithOovTests(); +} diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc new file mode 100644 index 00000000000..2d8f9f18638 --- /dev/null +++ b/src/lm/arpa-file-parser.cc @@ -0,0 +1,236 @@ +// lm/arpa-file-parser.cc + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
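The three WithOov tests above differ only in how an out-of-vocabulary word (the beta symbol, absent from TestSymbolTable) is treated. A rough stand-in for the three policies, using a plain std::map instead of fst::SymbolTable and illustrative symbol ids:

#include <iostream>
#include <map>
#include <string>

enum OovHandling { kRaiseError, kAddToSymbols, kReplaceWithUnk, kSkipNGram };

// Maps a word to an integer id, applying the chosen OOV policy when it is absent.
int MapWord(const std::string& w, OovHandling oov, int unk_id,
            std::map<std::string, int>* table, bool* skip_ngram) {
  std::map<std::string, int>::const_iterator it = table->find(w);
  if (it != table->end()) return it->second;     // in-vocabulary word
  switch (oov) {
    case kAddToSymbols: {                        // grow the symbol table
      int id = static_cast<int>(table->size());
      (*table)[w] = id;
      return id;
    }
    case kReplaceWithUnk:                        // substitute the <unk> id
      return unk_id;
    case kSkipNGram:                             // drop the whole n-gram
      *skip_ngram = true;
      return -1;
    default:                                     // kRaiseError
      std::cerr << "OOV word: " << w << std::endl;
      return -1;
  }
}

int main() {
  std::map<std::string, int> table;
  table["<eps>"] = 0; table["<s>"] = 1; table["</s>"] = 2;
  table["<unk>"] = 3; table["a"] = 4;
  bool skip = false;
  std::cout << MapWord("b", kReplaceWithUnk, 3, &table, &skip) << std::endl;  // 3
  std::cout << MapWord("b", kAddToSymbols, 3, &table, &skip) << std::endl;    // 5
  return 0;
}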
+ +#include + +#include + +#include "base/kaldi-error.h" +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" +#include "util/text-utils.h" + +namespace kaldi { + +ArpaFileParser::ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols) + : options_(options), symbols_(symbols), line_number_(0) { +} + +ArpaFileParser::~ArpaFileParser() { +} + +void ArpaFileParser::Read(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + + // Argument sanity checks. + if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 || + options_.bos_symbol == options_.eos_symbol) + KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and " + << "differ from each other. Given:" + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && + options_.oov_handling == ArpaParseOptions::kReplaceWithUnk && + (options_.unk_symbol <= 0 || + options_.unk_symbol == options_.bos_symbol || + options_.unk_symbol == options_.eos_symbol)) + KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, " + << "UNK symbol is required, must not be epsilon, and " + << "differ from both BOS and EOS symbols. Given:" + << " UNK=" << options_.unk_symbol + << " BOS=" << options_.bos_symbol + << " EOS=" << options_.eos_symbol; + if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty()) + KALDI_ERR << "BOS symbol must exist in symbol table"; + if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty()) + KALDI_ERR << "EOS symbol must exist in symbol table"; + if (symbols_ != NULL && options_.unk_symbol > 0 && + symbols_->Find(options_.unk_symbol).empty()) + KALDI_ERR << "UNK symbol must exist in symbol table"; + + ngram_counts_.clear(); + line_number_ = 0; + +#define PARSE_ERR (KALDI_ERR << "in line " << line_number_ << ": ") + + // Give derived class an opportunity to prepare its state. + ReadStarted(); + + std::string line; + + // Processes "\data\" section. + bool keyword_found = false; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + + // The section keywords starts with backslash. We terminate the while loop + // if a new section is found. + if (line[0] == '\\') { + if (!keyword_found && line == "\\data\\") { + KALDI_LOG << "Reading \\data\\ section."; + keyword_found = true; + continue; + } + break; + } + + if (!keyword_found) continue; + + // Enters "\data\" section, and looks for patterns like "ngram 1=1000", + // which means there are 1000 unigrams. + std::size_t equal_symbol_pos = line.find("="); + if (equal_symbol_pos != std::string::npos) + line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" + std::vector col; + SplitStringToVector(line, " \t", true, &col); + if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") { + int32 order, ngram_count = 0; + if (!ConvertStringToInteger(col[1], &order) || + !ConvertStringToInteger(col[3], &ngram_count)) { + PARSE_ERR << "Cannot parse ngram count '" << line << "'."; + } + if (ngram_counts_.size() <= order) { + ngram_counts_.resize(order); + } + ngram_counts_[order - 1] = ngram_count; + } else { + KALDI_WARN << "Uninterpretable line in \\data\\ section: " << line; + } + } + + if (ngram_counts_.size() == 0) + PARSE_ERR << "\\data\\ section missing or empty."; + + // Signal that grammar order and n-gram counts are known. + HeaderAvailable(); + + NGram ngram; + ngram.words.reserve(ngram_counts_.size()); + + // Processes "\N-grams:" section. 
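Before the n-gram sections are consumed, the \data\ header lines of the form "ngram N=count" are split around "=" and converted to integers, which fills ngram_counts_. A hypothetical standalone helper sketching the same parsing step (not the Kaldi function):

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Parses one "ngram N=count" line and records the count for that order.
bool ParseNgramCountLine(const std::string& line_in, std::vector<int>* counts) {
  std::string line(line_in);
  std::string::size_type eq = line.find('=');
  if (eq == std::string::npos) return false;
  line.replace(eq, 1, " = ");                 // same trick as the parser: pad "="
  std::istringstream ss(line);
  std::string kw, eq_tok;
  int order = 0, count = 0;
  if (!(ss >> kw >> order >> eq_tok >> count) || kw != "ngram" || eq_tok != "=")
    return false;
  if (static_cast<int>(counts->size()) < order) counts->resize(order, 0);
  (*counts)[order - 1] = count;
  return true;
}

int main() {
  std::vector<int> counts;
  ParseNgramCountLine("ngram 1=4", &counts);
  ParseNgramCountLine("ngram 2=2", &counts);
  ParseNgramCountLine("ngram 3=2", &counts);
  for (std::size_t i = 0; i < counts.size(); ++i)
    std::cout << "order " << i + 1 << ": " << counts[i] << std::endl;
  return 0;
}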
+ for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) { + // Skips n-grams with zero count. + if (ngram_counts_[cur_order - 1] == 0) { + KALDI_WARN << "Zero ngram count in ngram order " << cur_order + << "(look for 'ngram " << cur_order << "=0' in the \\data\\ " + << " section). There is possibly a problem with the file."; + continue; + } + + // Must be looking at a \k-grams: directive at this point. + std::ostringstream keyword; + keyword << "\\" << cur_order << "-grams:"; + if (line != keyword.str()) { + PARSE_ERR << "Invalid directive '" << line << "', " + << "expecting '" << keyword.str() << "'."; + } + KALDI_LOG << "Reading " << line << " section."; + + int32 ngram_count = 0; + while (++line_number_, getline(is, line) && !is.eof()) { + if (line.empty()) continue; + if (line[0] == '\\') break; + + std::vector col; + SplitStringToVector(line, " \t", true, &col); + + if (col.size() < 1 + cur_order || + col.size() > 2 + cur_order || + (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) { + PARSE_ERR << "Invalid n-gram line '" << line << "'"; + } + ++ngram_count; + + // Parse out n-gram logprob and, if present, backoff weight. + if (!ConvertStringToReal(col[0], &ngram.logprob)) { + PARSE_ERR << "Invalid n-gram logprob '" << col[0] << "'."; + } + ngram.backoff = 0.0; + if (col.size() > cur_order + 1) { + if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff)) + PARSE_ERR << "Invalid backoff weight '" << col[cur_order + 1] << "'."; + } + // Convert to natural log unless the option is set not to. + if (!options_.use_log10) { + ngram.logprob *= M_LN10; + ngram.backoff *= M_LN10; + } + + ngram.words.resize(cur_order); + bool skip_ngram = false; + for (int32 index = 0; !skip_ngram && index < cur_order; ++index) { + int32 word; + if (symbols_) { + // Symbol table provided, so symbol labels are expected. + if (options_.oov_handling == ArpaParseOptions::kAddToSymbols) { + word = symbols_->AddSymbol(col[1 + index]); + } else { + word = symbols_->Find(col[1 + index]); + if (word == fst::SymbolTable::kNoSymbol) { + switch(options_.oov_handling) { + case ArpaParseOptions::kReplaceWithUnk: + word = options_.unk_symbol; + break; + case ArpaParseOptions::kSkipNGram: + skip_ngram = true; + break; + default: + PARSE_ERR << "Word '" << col[1 + index] + << "' not in symbol table."; + } + } + } + } else { + // Symbols not provided, LM file should contain integers. + if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) { + PARSE_ERR << "invalid symbol '" << col[1 + index] << "'"; + } + } + // Whichever way we got it, an epsilon is invalid. 
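One detail worth calling out in the loop above is the layout check on each n-gram line: a valid entry has one logprob column, then N word columns, then an optional backoff weight, and the highest order never carries a backoff. An illustrative check of just that rule (not Kaldi's code), using lines from the test LM:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Returns true if the line has an acceptable number of columns for this order.
bool CheckNgramColumns(const std::string& line, size_t order, size_t max_order) {
  std::istringstream ss(line);
  std::vector<std::string> col;
  std::string tok;
  while (ss >> tok) col.push_back(tok);
  if (col.size() < 1 + order || col.size() > 2 + order)
    return false;                              // wrong number of fields
  if (order == max_order && col.size() != 1 + order)
    return false;                              // no backoff on the highest order
  return true;
}

int main() {
  std::cout << CheckNgramColumns("-1.45678 4 5 -3.23", 2, 3) << std::endl;   // 1
  std::cout << CheckNgramColumns("-0.34958 1 4 5", 3, 3) << std::endl;       // 1
  std::cout << CheckNgramColumns("-0.34958 1 4 5 -0.1", 3, 3) << std::endl;  // 0
  return 0;
}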
+ if (word == 0) { + PARSE_ERR << "Epsilon symbol '" << col[1 + index] + << "' is illegal in ARPA LM."; + } + ngram.words[index] = word; + } + if (!skip_ngram) { + ConsumeNGram(ngram); + } + } + if (ngram_count > ngram_counts_[cur_order - 1]) { + PARSE_ERR << "Header said there would be " << ngram_counts_[cur_order] + << " n-grams of order " << cur_order << ", but we saw " + << ngram_count; + } + } + + if (line != "\\end\\") { + PARSE_ERR << "Invalid or unexpected directive line '" << line << "', " + << "expected \\end\\."; + } + + ReadComplete(); + +#undef PARSE_ERR +} + +} // namespace kaldi diff --git a/src/lm/arpa-file-parser.h b/src/lm/arpa-file-parser.h new file mode 100644 index 00000000000..0011fb4ee21 --- /dev/null +++ b/src/lm/arpa-file-parser.h @@ -0,0 +1,125 @@ +// lm/arpa-file-parser.h + +// Copyright 2014 Guoguo Chen +// Copyright 2016 Smart Action Company LLC (kkm) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_LM_ARPA_FILE_PARSER_H_ +#define KALDI_LM_ARPA_FILE_PARSER_H_ + +#include +#include + +#include + +#include "base/kaldi-types.h" + +namespace kaldi { + +/** + Options that control ArpaFileParser +*/ +struct ArpaParseOptions { + enum OovHandling { + kRaiseError, ///< Abort on OOV words + kAddToSymbols, ///< Add novel words to the symbol table. + kReplaceWithUnk, ///< Replace OOV words with . + kSkipNGram ///< Skip n-gram with OOV word and continue. + }; + + ArpaParseOptions() + : bos_symbol(-1), eos_symbol(-1), unk_symbol(-1), + oov_handling(kRaiseError), use_log10(false) { } + + int32 bos_symbol; ///< Symbol for , Required non-epsilon. + int32 eos_symbol; ///< Symbol for , Required non-epsilon. + int32 unk_symbol; ///< Symbol for , Required for kReplaceWithUnk. + OovHandling oov_handling; ///< How to handle OOV words in the file. + bool use_log10; ///< Use log10 for prob and backoff weight, not ln. +}; + +/** + A parsed n-gram from ARPA LM file. +*/ +struct NGram { + NGram() : logprob(0.0), backoff(0.0) { } + std::vector words; ///< Symbols in LTR order. + float logprob; ///< Log-prob of the n-gram. + float backoff; ///< log-backoff weight of the n-gram. +}; + +/** + ArpaFileParser is an abstract base class for ARPA LM file conversion. + + See ConstArpaLmBuilder for a usage example. +*/ +class ArpaFileParser { + public: + /// Constructs the parser with the given options and optional symbol table. + /// If symbol table is provided, then the file should contain text n-grams, + /// and the words are mapped to symbols through it. bos_symbol and + /// eos_symbol in the options structure must be valid symbols in the table, + /// and so must be unk_symbol if provided. The table is not owned by the + /// parser, but may be augmented, if oov_handling is set to kAddToSymbols. + /// If symbol table is a null pointer, the file should contain integer + /// symbol values, and oov_handling has no effect. 
bos_symbol and eos_symbol + /// must be valid symbols still. + ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols); + virtual ~ArpaFileParser(); + + /// Read ARPA LM file through Kaldi I/O functions. Only text mode is + /// supported. + void Read(std::istream &is, bool binary); + + const ArpaParseOptions& Options() const { return options_; } + + protected: + /// Override called before reading starts. This is the point to prepare + /// any state in the derived class. + virtual void ReadStarted() { } + + /// Override function called to signal that ARPA header with the expected + /// number of n-grams has been read, and ngram_counts() is now valid. + virtual void HeaderAvailable() { } + + /// Pure override that must be implemented to process current n-gram. The + /// n-grams are sent in the file order, which guarantees that all + /// (k-1)-grams are processed before the first k-gram is. + virtual void ConsumeNGram(const NGram&) = 0; + + /// Override function called after the last n-gram has been consumed. + virtual void ReadComplete() { } + + /// Read-only access to symbol table. + const fst::SymbolTable* Symbols() const { return symbols_; } + + /// Inside ConsumeNGram(), provides the current line number. + int32 LineNumber() const { return line_number_; } + + /// N-gram counts. Valid in and after a call to HeaderAvailable(). + const std::vector& NgramCounts() const { return ngram_counts_; } + + private: + ArpaParseOptions options_; + fst::SymbolTable* symbols_; // Not owned. + int32 line_number_; + std::vector ngram_counts_; +}; + +} // namespace kaldi + +#endif // KALDI_LM_ARPA_FILE_PARSER_H_ diff --git a/src/lm/const-arpa-lm.cc b/src/lm/const-arpa-lm.cc index 7f63dce886e..5043933d7f0 100644 --- a/src/lm/const-arpa-lm.cc +++ b/src/lm/const-arpa-lm.cc @@ -22,13 +22,14 @@ #include #include +#include "base/kaldi-math.h" +#include "lm/arpa-file-parser.h" #include "lm/const-arpa-lm.h" #include "util/stl-utils.h" #include "util/text-utils.h" -#include "base/kaldi-math.h" -namespace kaldi { +namespace kaldi { // Auxiliary struct for converting ConstArpaLm format langugae model to Arpa // format. @@ -173,13 +174,10 @@ class LmState { // Class to build ConstArpaLm from Arpa format language model. It relies on the // auxiliary class LmState above. -class ConstArpaLmBuilder { +class ConstArpaLmBuilder : public ArpaFileParser { public: - ConstArpaLmBuilder( - const bool natural_base, const int32 bos_symbol, - const int32 eos_symbol, const int32 unk_symbol) : - natural_base_(natural_base), bos_symbol_(bos_symbol), - eos_symbol_(eos_symbol), unk_symbol_(unk_symbol) { + ConstArpaLmBuilder(ArpaParseOptions options) + : ArpaFileParser(options, NULL) { ngram_order_ = 0; num_words_ = 0; overflow_buffer_size_ = 0; @@ -204,21 +202,21 @@ class ConstArpaLmBuilder { } } - // Reads in the Arpa format language model, parses it and creates LmStates. - void Read(std::istream &is, bool binary); - // Writes ConstArpaLm. void Write(std::ostream &os, bool binary) const; - // Builds ConstArpaLm. - void Build(); - void SetMaxAddressOffset(const int32 max_address_offset) { KALDI_WARN << "You are changing ; the default should " << "not be changed unless you are in testing mode."; max_address_offset_ = max_address_offset; } + protected: + // ArpaFileParser overrides. 
+ virtual void HeaderAvailable(); + virtual void ConsumeNGram(const NGram& ngram); + virtual void ReadComplete(); + private: struct WordsAndLmStatePairLessThan { bool operator()( @@ -229,10 +227,6 @@ class ConstArpaLmBuilder { }; private: - // If true, use natural base e for log-prob, otherwise use base 10. The - // default base in Arpa format language model is base 10. - bool natural_base_; - // Indicating if ConstArpaLm has been built or not. bool is_built_; @@ -240,16 +234,6 @@ class ConstArpaLmBuilder { // The default value is 30-bits and should not be changed except for testing. int32 max_address_offset_; - // Integer corresponds to . - int32 bos_symbol_; - - // Integer corresponds to . - int32 eos_symbol_; - - // Integer corresponds to unknown-word. -1 if no unknown-word symbol is - // provided. - int32 unk_symbol_; - // N-gram order of language model. This can be figured out from "/data/" // section in Arpa format language model. int32 ngram_order_; @@ -280,201 +264,58 @@ class ConstArpaLmBuilder { LmState*, VectorHasher > seq_to_state_; }; -// Reads in the Arpa format language model, parses it and puts the word sequence -// into the corresponding LmState in . -void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { - if (binary) { - KALDI_ERR << "binary-mode reading is not implemented for " - << "ConstArpaLmBuilder."; - } - - std::string line; - - // Number of n-grams from "\data\" section. Those numbers should match the - // actual number of n-grams from "\N-grams:" sections. - // Note that when we convert the words in the Arpa format language model into - // integers, we remove lines with OOV words. We also modify the n-gram counts - // in "\data\" correspondingly. - std::vector num_ngrams; - - // Processes "\data\" section. - bool keyword_found = false; - while (getline(is, line) && !is.eof()) { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos) break; - if (line.find("\\end\\") != std::string::npos) break; - } - - std::size_t equal_symbol_pos = line.find("="); - if (equal_symbol_pos != std::string::npos) - line.replace(equal_symbol_pos, 1, " = "); // Inserts spaces around "=" - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\data\". - if (!keyword_found && col.size() == 1 && col[0] == "\\data\\") { - KALDI_LOG << "Reading \"\\data\\\" section."; - keyword_found = true; - continue; - } +void ConstArpaLmBuilder::HeaderAvailable() { + ngram_order_ = NgramCounts().size(); +} - // Enters "\data\" section, and looks for patterns like"ngram 1=1000", which - // means there are 1000 unigrams. 
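With the members removed above, the BOS/EOS/UNK symbols and the log base now reach the builder through ArpaParseOptions rather than dedicated constructor arguments, which is exactly what BuildConstArpaLm does further down. A small sketch of that configuration step, with placeholder symbol ids:

#include "lm/arpa-file-parser.h"

// Builds the options the refactored ConstArpaLmBuilder consumes; the ids are
// illustrative, in practice they come from the caller of BuildConstArpaLm.
kaldi::ArpaParseOptions MakeBuilderOptions(bool natural_base) {
  kaldi::ArpaParseOptions options;
  options.bos_symbol = 1;              // <s>
  options.eos_symbol = 2;              // </s>
  options.unk_symbol = 3;              // <unk>; use -1 if the LM has none
  options.use_log10 = !natural_base;   // the builder defaults to natural logs
  return options;
}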
- if (keyword_found && col.size() == 4 && col[0] == "ngram") { - if (col[2] == "=") { - int32 order, ngram_count; - if (!ConvertStringToInteger(col[1], &order)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1] << " to integer."; - } - if (!ConvertStringToInteger(col[3], &ngram_count)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[3] << " to integer."; - } - if (num_ngrams.size() <= order) { - num_ngrams.resize(order + 1); - } - num_ngrams[order] = ngram_count; - } else { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } - } else if (keyword_found) { - KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line; - } +void ConstArpaLmBuilder::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + // If is larger than 1, then we do not create LmState for + // the final order entry. We only keep the log probability for it. + LmState *lm_state = NULL; + if (cur_order != ngram_order_ || ngram_order_ == 1) { + lm_state = new LmState(cur_order == 1, + cur_order == ngram_order_ - 1, + ngram.logprob, ngram.backoff); + + KALDI_ASSERT(seq_to_state_.find(ngram.words) == seq_to_state_.end()); + seq_to_state_[ngram.words] = lm_state; } - if (num_ngrams.size() == 0) - KALDI_ERR << "Fail to read \"\\data\\\" section."; - ngram_order_ = num_ngrams.size() - 1; - - // Processes "\N-grams:" section. - int32 max_word_id = 0; - for (int32 cur_order = 1; cur_order < num_ngrams.size(); ++cur_order) { - // Skips n-grams with zero count. - if (num_ngrams[cur_order] == 0) continue; - - keyword_found = false; - int32 ngram_count = 0; - std::ostringstream keyword; - keyword << "\\" << cur_order << "-grams:"; - // We use "do ... while" loop since one line has already been read. - do { - // The section keywords starts with backslash. We terminate the while loop - // if a new section is found. - if (!line.empty() && line[0] == '\\') { - if (line.find("-grams:") != std::string::npos && keyword_found) break; - if (line.find("\\end\\") != std::string::npos) break; - } - std::vector col; - SplitStringToVector(line, " \t", true, &col); - - // Looks for keyword "\N-gram:" if the keyword has not been located. - if (!keyword_found && col.size() == 1 && col[0] == keyword.str()) { - KALDI_LOG << "Reading \"" << keyword.str() << "\" section."; - ngram_count = 0; - keyword_found = true; - continue; - } - - // Enters "\N-grams:" section if the keyword has been located. - if (keyword_found && col.size() > 0) { - KALDI_ASSERT(col.size() >= 1 + cur_order); - KALDI_ASSERT(col.size() <= 2 + cur_order); // backoff_logprob can be 0. - if (cur_order == ngram_order_ && col.size() == 2 + cur_order) { - KALDI_ERR << "Backoff probability detected for final-order entry \"" - << line << "\"."; - } - ngram_count++; - - // If backoff_logprob is 0, it will not appear in Arpa format language - // model. We put it back so the processing afterwards will be easier. - if (col.size() == 1 + cur_order) { - col.push_back("0"); - } - - // Creates LmState for the current word sequence. - bool is_unigram = (cur_order == 1) ? true : false; - float logprob; - float backoff_logprob; - KALDI_ASSERT(ConvertStringToReal(col[0], &logprob)); - KALDI_ASSERT(ConvertStringToReal(col[1 + cur_order], &backoff_logprob)); - if (natural_base_) { - logprob *= Log(10.0f); - backoff_logprob *= Log(10.0f); - } - - // If is larger than 1, then we do not create LmState for - // the final order entry. We only keep the log probability for it. 
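Both the parsing code being removed here and the ConsumeNGram() override that replaces it rely on the same two assumptions restated in the comments below: n-grams arrive from lower to higher order, and every n-gram's history is itself a valid n-gram seen earlier. A toy illustration of that invariant, using std::map in place of Kaldi's hash map and made-up word ids:

#include <iostream>
#include <map>
#include <vector>

int main() {
  typedef std::vector<int> Seq;
  std::map<Seq, float> states;            // word sequence -> some state payload

  // Unigrams 4 and 5, then the bigram "4 5"; zeros just pad the rows.
  int grams[][3] = { {4, 0, 0}, {5, 0, 0}, {4, 5, 0} };
  for (int g = 0; g < 3; ++g) {
    Seq words;
    for (int i = 0; i < 3 && grams[g][i] != 0; ++i) words.push_back(grams[g][i]);
    if (words.size() > 1) {
      Seq hist(words.begin(), words.end() - 1);
      if (states.find(hist) == states.end()) {   // the parent n-gram must exist
        std::cerr << "missing history for a " << words.size() << "-gram\n";
        return 1;
      }
    }
    states[words] = 0.0f;                        // register this n-gram as a state
  }
  std::cout << "all histories present\n";
  return 0;
}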
- LmState *lm_state = NULL; - if (cur_order != ngram_order_ || ngram_order_ == 1) { - lm_state = new LmState(is_unigram, - (cur_order == ngram_order_ - 1), - logprob, backoff_logprob); - } - - // Figures out the sequence of words. - std::vector seq(cur_order, 0); - for (int32 index = 0; index < cur_order; ++index) { - int32 word; - if (!ConvertStringToInteger(col[1 + index], &word)) { - KALDI_ERR << "bad line: " << line << "; fail to convert " - << col[1 + index] << " to integer."; - } - seq[index] = word; - } - - // If is larger than 1, then we do not insert LmState to - // . - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(seq_to_state_.find(seq) == seq_to_state_.end()); - seq_to_state_[seq] = lm_state; - } - - // If n-gram order is larger than 1, we have to add possible child to - // existing LmStates. We have the following two assumptions: - // 1. N-grams are processed from small order to larger ones, i.e., from - // 1, 2, ... to the highest order. - // 2. If a n-gram exists in the Arpa format language model, then the - // "history" n-gram also exists. For example, if "A B C" is a valid - // n-gram, then "A B" is also a valid n-gram. - if (cur_order > 1) { - std::vector hist(seq.begin(), seq.begin() + cur_order - 1); - int32 word = seq[seq.size() - 1]; - unordered_map, - LmState*, VectorHasher >::iterator hist_iter; - hist_iter = seq_to_state_.find(hist); - KALDI_ASSERT(hist_iter != seq_to_state_.end()); - if (cur_order != ngram_order_ || ngram_order_ == 1) { - KALDI_ASSERT(lm_state != NULL); - KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, lm_state); - } else { - KALDI_ASSERT(lm_state == NULL); - KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); - hist_iter->second->AddChild(word, logprob); - } - } else { - // Figures out . - KALDI_ASSERT(seq.size() == 1); - if (seq[0] > max_word_id) { - max_word_id = seq[0]; - } - } - } - } while (getline(is, line) && !is.eof()); - if (ngram_count > num_ngrams[cur_order] || - (ngram_count == 0 && num_ngrams[cur_order] != 0)) { - KALDI_ERR << "Header said there would be " << num_ngrams[cur_order] - << " n-grams of order " << cur_order << ", but we saw " - << ngram_count; + // If n-gram order is larger than 1, we have to add possible child to + // existing LmStates. We have the following two assumptions: + // 1. N-grams are processed from small order to larger ones, i.e., from + // 1, 2, ... to the highest order. + // 2. If a n-gram exists in the Arpa format language model, then the + // "history" n-gram also exists. For example, if "A B C" is a valid + // n-gram, then "A B" is also a valid n-gram. + int32 last_word = ngram.words[cur_order - 1]; + if (cur_order > 1) { + std::vector hist(ngram.words.begin(), ngram.words.end() - 1); + unordered_map, + LmState*, VectorHasher >::iterator hist_iter; + hist_iter = seq_to_state_.find(hist); + if (hist_iter == seq_to_state_.end()) { + std::ostringstream ss; + for (int i = 0; i < cur_order; ++i) + ss << (i == 0 ? 
'[' : ' ') << ngram.words[i]; + KALDI_ERR << "In line " << LineNumber() << ": " + << cur_order << "-gram " << ss.str() << "] does not have " + << "a parent model " << cur_order << "-gram."; + } + if (cur_order != ngram_order_ || ngram_order_ == 1) { + KALDI_ASSERT(lm_state != NULL); + KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, lm_state); + } else { + KALDI_ASSERT(lm_state == NULL); + KALDI_ASSERT(hist_iter->second->IsChildFinalOrder()); + hist_iter->second->AddChild(last_word, ngram.logprob); } + } else { + // Figures out . + num_words_ = std::max(num_words_, last_word + 1); } - - // is plus 1. - num_words_ = max_word_id + 1; } // ConstArpaLm can be built in the following steps, assuming we have already @@ -503,7 +344,7 @@ void ConstArpaLmBuilder::Read(std::istream &is, bool binary) { // At the same time, we will also create two special buffers: // // -void ConstArpaLmBuilder::Build() { +void ConstArpaLmBuilder::ReadComplete() { // STEP 1: sorting LmStates lexicographically. // Vector for holding the sorted LmStates. std::vector*, LmState*> > sorted_vec; @@ -637,9 +478,10 @@ void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const { KALDI_ASSERT(is_built_); // Creates ConstArpaLm. - ConstArpaLm const_arpa_lm(bos_symbol_, eos_symbol_, unk_symbol_, ngram_order_, - num_words_, overflow_buffer_size_, lm_states_size_, - unigram_states_, overflow_buffer_, lm_states_); + ConstArpaLm const_arpa_lm( + Options().bos_symbol, Options().eos_symbol, Options().unk_symbol, + ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_, + unigram_states_, overflow_buffer_, lm_states_); const_arpa_lm.Write(os, binary); } @@ -1224,10 +1066,15 @@ bool BuildConstArpaLm(const bool natural_base, const int32 bos_symbol, const int32 eos_symbol, const int32 unk_symbol, const std::string& arpa_rxfilename, const std::string& const_arpa_wxfilename) { - ConstArpaLmBuilder lm_builder(natural_base, bos_symbol, - eos_symbol, unk_symbol); + ArpaParseOptions options; + options.bos_symbol = bos_symbol; + options.eos_symbol = eos_symbol; + options.unk_symbol = unk_symbol; + options.use_log10 = !natural_base; + + ConstArpaLmBuilder lm_builder(options); + KALDI_LOG << "Reading " << arpa_rxfilename; ReadKaldiObject(arpa_rxfilename, &lm_builder); - lm_builder.Build(); WriteKaldiObject(lm_builder, const_arpa_wxfilename, true); return true; } diff --git a/src/lm/kaldi-rnnlm.cc b/src/lm/kaldi-rnnlm.cc index e1fbcbdc08b..3a811c4c0e5 100644 --- a/src/lm/kaldi-rnnlm.cc +++ b/src/lm/kaldi-rnnlm.cc @@ -58,8 +58,8 @@ KaldiRnnlmWrapper::KaldiRnnlmWrapper( BaseFloat KaldiRnnlmWrapper::GetLogProb( int32 word, const std::vector &wseq, - const std::vector &context_in, - std::vector *context_out) { + const std::vector &context_in, + std::vector *context_out) { std::vector wseq_symbols(wseq.size()); for (int32 i = 0; i < wseq_symbols.size(); ++i) { @@ -79,7 +79,7 @@ RnnlmDeterministicFst::RnnlmDeterministicFst(int32 max_ngram_order, // Uses empty history for . std::vector