diff --git a/.gitattributes b/.gitattributes
index 5a815654b4c..bede44edf8a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -15,4 +15,6 @@ windows/INSTALL*   eol=native
 windows/NewGuidCmd.exe.config text eol=crlf
 windows/NewGuidCmd.exe binary
 
+# Prevent git changing CR-LF to LF when archiving (patch requires CR-LF on Windows).
+**/*.patch            -text
 
diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt
new file mode 100644
index 00000000000..71ab9f0fa45
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/README.txt
@@ -0,0 +1,12 @@
+
+there are a lot of tuning experiments here.
+
+ones to look at right now:
+  2y is a TDNN baseline
+  4f is a good jesus-layer system
+  4q is an improved TDNN with various bells and whistles from Vijay.
+  4r is a slightly-better jesus-layer system than 4f, with one more layer.
+  5e is the best configuration run so far.
+
+
+
diff --git a/egs/swbd/s5c/local/chain/run_discriminative.sh b/egs/swbd/s5c/local/chain/run_discriminative.sh
new file mode 100755
index 00000000000..f2b4da87920
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_discriminative.sh
@@ -0,0 +1,224 @@
+#!/bin/bash
+
+set -e 
+set -o pipefail
+
+# this is run_discriminative.sh
+
+# This script does discriminative training on top of nnet3 system.
+# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
+# since the lattice generation runs in about real-time, so takes of the order of
+# 1000 hours of CPU time.
+# 
+# Note: rather than using any features we have dumped on disk, this script
+# regenerates them from the wav data three times-- when we do lattice
+# generation, numerator alignment and discriminative training.  This made the
+# script easier to write and more generic, because we don't have to know where
+# the features and the iVectors are, but of course it's a little inefficient.
+# The time taken is dominated by the lattice generation anyway, so this isn't
+# a huge deal.
+
+. cmd.sh
+
+
+stage=0
+train_stage=-10  # can be used to start training in the middle.
+get_egs_stage=-10
+use_gpu=true
+srcdir=exp/chain/tdnn_5e_sp
+criterion=smbr
+drop_frames=false  # only matters for MMI.
+frames_per_eg=150
+frames_overlap_per_eg=30
+effective_learning_rate=0.0000125
+max_param_change=1
+num_jobs_nnet=4
+has_fisher=true  # if true, rescore the decoded lattices with data/lang_sw1_fsh_fg after decoding.
+decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
+num_epochs=4
+degs_dir=
+cleanup=false  # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
+               # alignments and degs).
+regularization_opts=
+lats_dir=
+train_data_dir=data/train_nodup_sp_hires
+online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp
+one_silence_class=true
+truncate_deriv_weights=10
+minibatch_size=64
+
+adjust_priors=true
+
+determinize=true
+minimize=true
+remove_output_symbols=true
+remove_epsilons=true
+collapse_transition_ids=true
+
+modify_learning_rates=true
+last_layer_factor=1.0
+
+set -e
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if $use_gpu; then
+  if ! cuda-compiled; then
+    cat <<EOF && exit 1 
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA 
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.  Otherwise, call this script with --use-gpu false
+EOF
+  fi
+  num_threads=1
+else
+  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
+  # almost the same, but this may be a little bit slow.
+  num_threads=16
+fi
+
+if [ ! -f ${srcdir}/final.mdl ]; then
+  echo "$0: expected ${srcdir}/final.mdl to exist; first run run_tdnn.sh or run_lstm.sh"
+  exit 1;
+fi
+
+lang=data/lang
+
+if [ $stage -le 1 ]; then
+  # hardcode no-GPU for alignment, although you could use GPU [you wouldn't
+  # get excellent GPU utilization though.]
+  nj=350 # have a high number of jobs because this could take a while, and we might
+         # have some stragglers.
+  use_gpu=no
+  gpu_opts=
+
+  steps/nnet3/align.sh  --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
+     --online-ivector-dir $online_ivector_dir \
+     --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \
+     --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali || exit 1;
+fi
+
+if [ -z "$lats_dir" ]; then
+  lats_dir=${srcdir}_denlats
+  if [ $stage -le 2 ]; then
+    nj=50  # this doesn't really affect anything strongly, except the num-jobs for one of
+    # the phases of get_egs_discriminative.sh below.
+    num_threads_denlats=6
+    subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
+    # total slots = 80 * 6 = 480).
+    steps/nnet3/make_denlats.sh --cmd "$decode_cmd --mem 1G --num-threads $num_threads_denlats" \
+      --self-loop-scale 1.0 --acwt 1.0 --extra-left-context 20 \
+      --online-ivector-dir $online_ivector_dir --determinize $determinize \
+      --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
+      $train_data_dir $lang $srcdir ${lats_dir} || exit 1;
+  fi
+fi
+
+left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` || exit 1
+right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` || exit 1
+
+frame_subsampling_opt=
+if [ -f $srcdir/frame_subsampling_factor ]; then
+  frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)"
+fi
+
+cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1
+
+if [ -z "$degs_dir" ]; then
+  degs_dir=${srcdir}_degs_n${frames_per_eg}_o${frames_overlap_per_eg}_f
+  if $determinize; then
+    degs_dir=${degs_dir}d
+  fi
+  if $minimize; then
+    degs_dir=${degs_dir}m
+  fi
+  if $remove_output_symbols; then
+    degs_dir=${degs_dir}r
+  fi
+  if $remove_epsilons; then
+    degs_dir=${degs_dir}e
+  fi
+  if $collapse_transition_ids; then
+    degs_dir=${degs_dir}c
+  fi
+
+  if [ $stage -le 3 ]; then
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${degs_dir}/storage ]; then
+      utils/create_split_dir.pl \
+        /export/b0{1,2,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${degs_dir}/storage ${degs_dir}/storage
+    fi
+    # allow a higher maximum num-jobs if the degs have a split storage directory (created above).
+    if [ -d ${degs_dir}/storage ]; then max_jobs=10; else max_jobs=5; fi
+
+    degs_opts="--determinize $determinize --minimize $minimize --remove-output-symbols $remove_output_symbols --remove-epsilons $remove_epsilons --collapse-transition-ids $collapse_transition_ids"
+
+    steps/nnet3/get_egs_discriminative.sh \
+      --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \
+      --adjust-priors $adjust_priors --acwt 1.0 \
+      --online-ivector-dir $online_ivector_dir --left-context $left_context --right-context $right_context $frame_subsampling_opt \
+      --criterion $criterion --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \
+      $train_data_dir $lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir || exit 1;
+  fi
+fi
+
+d=`basename $degs_dir`
+dir=${srcdir}_${criterion}_${effective_learning_rate}_degs${d##*degs}_ms${minibatch_size}
+
+if $one_silence_class; then
+  dir=${dir}_onesil
+fi
+
+if $modify_learning_rates; then
+  dir=${dir}_modify
+fi
+
+if [ "$last_layer_factor" != "1.0" ]; then
+  dir=${dir}_llf$last_layer_factor
+fi
+
+if [ $stage -le 4 ]; then
+  bash -x steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \
+    --stage $train_stage \
+    --effective-lrate $effective_learning_rate --max-param-change $max_param_change \
+    --criterion $criterion --drop-frames $drop_frames --acoustic-scale 1.0 \
+    --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \
+    --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \
+    --regularization-opts "$regularization_opts" \
+    --truncate-deriv-weights $truncate_deriv_weights --adjust-priors $adjust_priors \
+    --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \
+      ${degs_dir} $dir || exit 1;
+fi
+
+# Stage 5: decode train_dev and eval2000 in parallel, writing to $dir/decode_*.
+decode_suff=sw1_tg
+graph_dir=$srcdir/graph_sw1_tg
+if [ $stage -le 5 ]; then  # was 14, which made "--cleanup true --stage 6" re-run decoding.
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # defaults to true when has_fisher is not set anywhere.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+
+if [ $stage -le 6 ] && $cleanup; then
+  # if you run with "--cleanup true --stage 6" you can clean up.
+  rm ${lats_dir}/lat.*.gz || true
+  rm ${srcdir}_ali/ali.*.gz || true
+  steps/nnet2/remove_egs.sh ${degs_dir} || true  # the degs are in $degs_dir, not ${srcdir}_degs.
+fi
+
+
+exit 0;
+
+
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh
index 2e08d5e22af..a8552244ed2 100755
--- a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh
+++ b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh
@@ -276,4 +276,4 @@ b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_s
 b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh ; done
 %WER 16.57 [ 8155 / 49204, 1144 ins, 1988 del, 5023 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
 %WER 16.83 [ 8282 / 49204, 1106 ins, 2115 del, 5061 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
-%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
\ No newline at end of file
+%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh
index 4c0ac7e62ca..d17ebdf9be7 100755
--- a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh
+++ b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh
@@ -301,4 +301,4 @@ LOG (lattice-best-path:main():lattice-best-path.cc:99) For utterance sp1.0-sw028
 LOG (lattice-best-path:main():lattice-best-path.cc:124) Overall score per frame is 46.9461 = 0.0637047 [graph] + 46.8824 [acoustic] over 843 frames.
 LOG (lattice-best-path:main():lattice-best-path.cc:128) Done 1 lattices, failed for 0
 LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances.
-sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil
\ No newline at end of file
+sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh
new file mode 100755
index 00000000000..4f350891e8a
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh
@@ -0,0 +1,274 @@
+#!/bin/bash
+
+# _3c is as _2y, but using 'jesus' nonlinearity: the --jesus-dim 800 option, instead of
+#   --relu-dim 850.
+#  reusing the egs from 2y.
+# caution: see config section, I changed some things while running.
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3c  # Note: _sp will get added to this if $speed_perturb == true.
+
+# TDNN options
+splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3"
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+# max_param_change=1.0
+max_param_change=0.5  # Changed it to this value on iteration  74.
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=64  # switched to 64 on iteration 7 after a failure.
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # $dir/egs is only created by the branch above, so make sure it exists before touching the marker.
+  mkdir -p $dir/egs; touch $dir/egs/.nodelete  # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --splice-indexes "$splice_indexes" \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --jesus-dim 800 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+# Decode train_dev and eval2000 in parallel, then optionally rescore with data/lang_sw1_fsh_fg.
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; default to rescoring.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh
new file mode 100755
index 00000000000..ca8080db080
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh
@@ -0,0 +1,286 @@
+#!/bin/bash
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+# (note: cannot be reproduced using current scripts).
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+# Results are about the same as 2y, or maybe just a little worse.
+
+# a03:s5c: ./show_wer.sh 3d
+# %WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails outside a tested context
+
+# configs for 'chain'
+stage=12               # stages below this are skipped by the 'if [ $stage -le N ]' guards
+train_stage=-10        # passed to train_tdnn.sh as --stage
+get_egs_stage=-10      # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3d  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (as for path.sh) so the lookup cannot go through $PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # continue only if 'cuda-compiled' succeeds
+  cat <<EOF; exit 1  # ';' rather than '&&': exit even if printing the message fails
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Lattice-form alignments give the CTC training more freedom; reuse the
+  # job count recorded in the existing alignment directory.
+  nj=$(cat $ali_dir/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz  # dropped to save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Clone data/lang into $lang and give the copy the chain topology.  That
+  # topology is nominally one state per phone [really two: the first occurs
+  # exactly once, the second may repeat zero or more times].
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_csl=$(cat $lang/phones/silence.csl) || exit 1
+  nonsil_csl=$(cat $lang/phones/nonsilence.csl) || exit 1
+  # gen_topo.py is given the non-silence list first, then the silence list;
+  # this special topology may need tuning later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_csl $sil_csl >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Tree building, run against the topology stored in $lang.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # nothing above is guaranteed to have created it; touch would fail under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the model; the egs are re-used from exp/chain/tdnn_2y_sp/egs via --egs-dir.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+decode_pids=  # PIDs of the background decode jobs started below
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; the '|| exit 1's below only leave this subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset/empty used to run as an empty command, i.e. "true"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"  # remember the job so its exit status can be checked
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' would discard failures
+exit $rc
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh
new file mode 100755
index 00000000000..af5661b8c85
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh
@@ -0,0 +1,275 @@
+#!/bin/bash
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+# (note: cannot be reproduced using current scripts).
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails outside a tested context
+
+# configs for 'chain'
+stage=12               # stages below this are skipped by the 'if [ $stage -le N ]' guards
+train_stage=-10        # passed to train_tdnn.sh as --stage
+get_egs_stage=-10      # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3e  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (as for path.sh) so the lookup cannot go through $PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # continue only if 'cuda-compiled' succeeds
+  cat <<EOF; exit 1  # ';' rather than '&&': exit even if printing the message fails
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Lattice-form alignments give the CTC training more freedom; reuse the
+  # job count recorded in the existing alignment directory.
+  nj=$(cat $ali_dir/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz  # dropped to save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Clone data/lang into $lang and give the copy the chain topology.  That
+  # topology is nominally one state per phone [really two: the first occurs
+  # exactly once, the second may repeat zero or more times].
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_csl=$(cat $lang/phones/silence.csl) || exit 1
+  nonsil_csl=$(cat $lang/phones/nonsilence.csl) || exit 1
+  # gen_topo.py is given the non-silence list first, then the silence list;
+  # this special topology may need tuning later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_csl $sil_csl >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Tree building, run against the topology stored in $lang.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # nothing above is guaranteed to have created it; touch would fail under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the model; the egs are re-used from exp/chain/tdnn_2y_sp/egs via --egs-dir.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+decode_pids=  # PIDs of the background decode jobs started below
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; the '|| exit 1's below only leave this subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset/empty used to run as an empty command, i.e. "true"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"  # remember the job so its exit status can be checked
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' would discard failures
+exit $rc
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh
new file mode 100755
index 00000000000..f33459f5f08
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh
@@ -0,0 +1,283 @@
+#!/bin/bash
+
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# (note: cannot be reproduced using current scripts).
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails outside a tested context
+
+# configs for 'chain'
+stage=12               # stages below this are skipped by the 'if [ $stage -le N ]' guards
+train_stage=-10        # passed to train_tdnn.sh as --stage
+get_egs_stage=-10      # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3f  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (as for path.sh) so the lookup cannot go through $PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # continue only if 'cuda-compiled' succeeds
+  cat <<EOF; exit 1  # ';' rather than '&&': exit even if printing the message fails
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Lattice-form alignments give the CTC training more freedom; reuse the
+  # job count recorded in the existing alignment directory.
+  nj=$(cat $ali_dir/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz  # dropped to save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Clone data/lang into $lang and give the copy the chain topology.  That
+  # topology is nominally one state per phone [really two: the first occurs
+  # exactly once, the second may repeat zero or more times].
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_csl=$(cat $lang/phones/silence.csl) || exit 1
+  nonsil_csl=$(cat $lang/phones/nonsilence.csl) || exit 1
+  # gen_topo.py is given the non-silence list first, then the silence list;
+  # this special topology may need tuning later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_csl $sil_csl >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Tree building, run against the topology stored in $lang.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # nothing above is guaranteed to have created it; touch would fail under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the model; the egs are re-used from exp/chain/tdnn_2y_sp/egs via --egs-dir.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+decode_pids=  # PIDs of the background decode jobs started below
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; the '|| exit 1's below only leave this subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset/empty used to run as an empty command, i.e. "true"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"  # remember the job so its exit status can be checked
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' would discard failures
+exit $rc
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh
new file mode 100755
index 00000000000..ff1e539306f
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh
@@ -0,0 +1,303 @@
+#!/bin/bash
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# (note: cannot be reproduced using current scripts).
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12  # stage blocks numbered below this value are skipped
+train_stage=-10  # passed to train_tdnn.sh as --stage
+get_egs_stage=-10  # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3g  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so the local cmd.sh is sourced rather than one found via $PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # print the explanation below and exit with status 1
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # appended to the experiment, data, alignment and tree directory names below
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# If we are using the speed-perturbed data we need to generate
+# alignments for it (--generate-alignments is given the value of $speed_perturb).
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Use the same number of jobs as recorded in the alignment directory's num_jobs file.
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy before re-copying data/lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology (9000 is presumably the target number of leaves; confirm against build_tree.sh).
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ # Nothing above creates $dir/egs when the hostname test fails, so create it first; otherwise 'touch' aborts the run under 'set -e'.
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+ # Train the network; the egs are taken from the directory given by --egs-dir rather than dumped afresh.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs; their exit statuses are checked after the loop.
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; the empty value already meant "rescore"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' returns 0 and would hide failed decodes
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh
new file mode 100755
index 00000000000..f0e9efc2ac4
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh
@@ -0,0 +1,289 @@
+#!/bin/bash
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12  # stage blocks numbered below this value are skipped
+train_stage=-10  # passed to train_tdnn.sh as --stage
+get_egs_stage=-10  # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3h  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so the local cmd.sh is sourced rather than one found via $PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # print the explanation below and exit with status 1
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # appended to the experiment, data, alignment and tree directory names below
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# If we are using the speed-perturbed data we need to generate
+# alignments for it (--generate-alignments is given the value of $speed_perturb).
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Use the same number of jobs as recorded in the alignment directory's num_jobs file.
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy before re-copying data/lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology (9000 is presumably the target number of leaves; confirm against build_tree.sh).
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ # Nothing above creates $dir/egs when the hostname test fails, so create it first; otherwise 'touch' aborts the run under 'set -e'.
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+ # Train the network; the egs are taken from the directory given by --egs-dir rather than dumped afresh.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs; their exit statuses are checked after the loop.
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; the empty value already meant "rescore"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' returns 0 and would hide failed decodes
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh
new file mode 100755
index 00000000000..876048b5852
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh
@@ -0,0 +1,311 @@
+#!/bin/bash
+
+# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is
+# applied, in order to control how fast the final layer's affine component learns.
+# also a code fix (the recurrent connections weren't being used; bug in OptionalDescriptor)
+
+# Here is the original decoding, with frame-per-chunk=50
+#./show_wer.sh 3i
+#%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# and a newer decoding with frames-per-chunk=100.
+# ./show_wer.sh 3i
+#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# after initial decoding wasn't great, trying increasing frames-per-chunk from
+# 50 to 100.
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12           # stage blocks numbered below this value are skipped
+train_stage=-10    # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+has_fisher=true    # decode stage runs 'if $has_fisher'; it was never set in this script, so that test only passed as an empty command
+dir=exp/chain/tdnn_3i  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh  # presumably lets --name value options override the variables defined above -- confirm
+
+if ! cuda-compiled; then  # cuda-compiled returned non-zero: print the message below and exit with status 1
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then  # Stage 9: lattices go to exp/tri4_lats_nodup$suffix, which stage 12 passes to train_tdnn.sh
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Use the same number of jobs as the alignments (read from their num_jobs file).
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;  # same path as $ali_dir/num_jobs
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # Stage 10: build $lang as a copy of data/lang with a new topo file
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any earlier copy so the cp below starts clean
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # replaces any topo copied from data/lang
+fi
+
+if [ $stage -le 11 ]; then  # Stage 11: the tree is written to $treedir, which stage 12 passes to train_tdnn.sh
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir  # 9000: presumably the num-states target ("12k to 9k" in the _q note) -- confirm
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: network training
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # when the branch above is skipped nothing creates this dir, and touch would abort the script under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the network, reusing the egs of the 2y run (see --egs-dir).
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # Stage 13: write the graph to $dir/graph_sw1_tg, which the decode stage reads
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg  # suffix of the first-pass decode directory names
+graph_dir=$dir/graph_sw1_tg  # graph directory written by stage 13
+if [ $stage -le 14 ]; then  # Stage 14: decode train_dev and eval2000
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set, so the sets are decoded concurrently
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --frames-per-chunk 100 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;  # exits only this subshell
+      if $has_fisher; then  # $has_fisher is run as a command (true/false); braces below expand to the tg and fsh_fg lang/decode dir pairs
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # block until every background subshell has finished
+exit 0;  # NOTE(review): always reports success; the exit status of the background subshells is never checked
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh
new file mode 100755
index 00000000000..faef84e8879
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh
@@ -0,0 +1,296 @@
+#!/bin/bash
+
+# _3j is as _3i but using BlockAffineComponent instead of
+# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false
+# option, which is newly added to the script).
+
+# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is
+# applied, in order to control how fast the final layer's affine component learns.
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12           # stage blocks numbered below this value are skipped
+train_stage=-10    # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+has_fisher=true    # decode stage runs 'if $has_fisher'; it was never set in this script, so that test only passed as an empty command
+dir=exp/chain/tdnn_3j  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh  # presumably lets --name value options override the variables defined above -- confirm
+
+if ! cuda-compiled; then  # cuda-compiled returned non-zero: print the message below and exit with status 1
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then  # Stage 9: lattices go to exp/tri4_lats_nodup$suffix, which stage 12 passes to train_tdnn.sh
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Use the same number of jobs as the alignments (read from their num_jobs file).
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;  # same path as $ali_dir/num_jobs
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # Stage 10: build $lang as a copy of data/lang with a new topo file
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any earlier copy so the cp below starts clean
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # replaces any topo copied from data/lang
+fi
+
+if [ $stage -le 11 ]; then  # Stage 11: the tree is written to $treedir, which stage 12 passes to train_tdnn.sh
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir  # 9000: presumably the num-states target ("12k to 9k" in the _q note) -- confirm
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: network training
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # when the branch above is skipped nothing creates this dir, and touch would abort the script under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the network, reusing the egs of the 2y run (see --egs-dir).
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # Stage 13: write the graph to $dir/graph_sw1_tg, which the decode stage reads
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg  # suffix of the first-pass decode directory names
+graph_dir=$dir/graph_sw1_tg  # graph directory written by stage 13
+if [ $stage -le 14 ]; then  # Stage 14: decode train_dev and eval2000
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set, so the sets are decoded concurrently
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;  # exits only this subshell
+      if $has_fisher; then  # $has_fisher is run as a command (true/false); braces below expand to the tg and fsh_fg lang/decode dir pairs
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # block until every background subshell has finished
+exit 0;  # NOTE(review): always reports success; the exit status of the background subshells is never checked
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh
new file mode 100755
index 00000000000..b869c7b2553
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh
@@ -0,0 +1,310 @@
+#!/bin/bash
+
+# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316
+# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it
+# was previously learning too slow, I think.  I also changed the script
+# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial
+# param-stddev of 0 which will discourage those corresponding input weights in
+# the jesus layer from getting small in early iters; and removed the --normalize-target
+# option and replaced it with the --final-layer-learning-rate-factor option.
+
+#  # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better):
+#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# The following are the corresponding results from 3i, decoded with the same chunk size.
+##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0
+##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is
+# applied, in order to control how fast the final layer's affine component learns.
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12               # blocks guarded by a stage number below this value are skipped
+train_stage=-10        # passed as --stage to steps/nnet3/chain/train_tdnn.sh
+get_egs_stage=-10      # passed as --get-egs-stage to train_tdnn.sh
+speed_perturb=true
+dir=exp/chain/tdnn_3k  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true        # if true, the decode stage also rescores with the sw1_fsh_fg LM
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh    # explicit path, so a cmd.sh found on $PATH cannot shadow the local one
+. ./path.sh
+. ./utils/parse_options.sh  # presumably applies --<name> <value> overrides (e.g. --stage 8)
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# iVector extraction and feature dumping are done exactly as in the standard
+# nnet3 setup; if those steps have already been run they can be skipped by
+# passing "--stage 8".
+
+# Speed-perturbed data, and everything derived from it, carries an "_sp" suffix.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+
+lang=data/lang_chain_2y
+dir=$dir$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+
+# Speed-perturbed data needs its own alignments, which is why
+# --generate-alignments is given the same value as --speed-perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Produce the training alignments in lattice form (this gives the CTC
+  # training more freedom), with the same number of jobs as the alignments.
+  lats_dir=exp/tri4_lats_nodup$suffix
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 $lats_dir
+  rm $lats_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 10 ]; then
+  # Make a copy of data/lang whose topo file has one state per phone.  [Strictly
+  # speaking it has two states: the first one is only repeated once, the second
+  # one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Overwrite the copied topo with our special topology; it may need tuning
+  # later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the tree on top of the new topology.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # without this the touch below can fail and 'set -e' aborts before training
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=; decode_status=0  # background decoding jobs and their combined exit status
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset has always meant "rescore"; make that default explicit
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+for pid in $decode_pids; do wait $pid || decode_status=1; done; wait  # a bare 'wait' would hide failures
+exit $decode_status
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh
new file mode 100755
index 00000000000..7a016ed2197
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh
@@ -0,0 +1,358 @@
+#!/bin/bash
+
+# 3k2 is as 3k, but dumping the egs with --extra-left-context 20.
+# Also there will have been some script changes in the meantime,
+# e.g. possibly nonzero bias-mean; and reduced max-change on mix-up
+# iters.
+
+# log-probs are better than 3k and in fact better than any experiment so far:
+# valid -0.115->-0.107, and train -0.077 to -0.074.
+
+# Here is the WER using the default --frames-per-chunk of 50, and --extra-left-context 20:
+#./show_wer.sh 3k2
+#%WER 20.45 [ 10060 / 49204, 988 ins, 3050 del, 6022 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_12_0.0
+#%WER 19.02 [ 9359 / 49204, 977 ins, 2877 del, 5505 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 22.3 | 4459 42989 | 79.9 12.8 7.3 2.3 22.3 60.2 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#%WER 20.4 | 4459 42989 | 81.5 11.1 7.4 1.9 20.4 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.filt.sys
+
+#... and here is the WER after changing it to 150, still with --extra-left-context 20:
+#./show_wer.sh 3k2
+#%WER 18.91 [ 9306 / 49204, 1076 ins, 2517 del, 5713 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 17.43 [ 8574 / 49204, 958 ins, 2607 del, 5009 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 20.6 | 4459 42989 | 81.7 12.2 6.0 2.4 20.6 58.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#%WER 18.8 | 4459 42989 | 83.4 10.9 5.6 2.3 18.8 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# the following is --frames-per-chunk 150, --extra-left-context 50 (changing the extra-left-context from 20 to 50 makes it worse):
+#./show_wer.sh 3k2
+#%WER 19.46 [ 9574 / 49204, 1134 ins, 2635 del, 5805 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 17.87 [ 8792 / 49204, 880 ins, 3011 del, 4901 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 21.0 | 4459 42989 | 81.2 12.4 6.3 2.2 21.0 58.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 19.2 | 4459 42989 | 82.7 10.8 6.5 1.9 19.2 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# the following is with --frames-per-chunk 150, --extra-left-context 50, --extra-left-context-initial 20.
+#./show_wer.sh 3k2
+#%WER 19.10 [ 9400 / 49204, 1116 ins, 2498 del, 5786 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 17.54 [ 8628 / 49204, 884 ins, 2890 del, 4854 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 20.6 | 4459 42989 | 81.7 12.2 6.1 2.3 20.6 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 18.7 | 4459 42989 | 83.4 10.8 5.8 2.1 18.7 55.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# the following is with --extra-left-context-initial 20 --extra-left-context 50 --frames-per-chunk 100.
+# I think what's happening is that it's figuring out when it's near the end of the chunk, and encouraging
+# deletions at that point, for reasons that relate to edge effects in the objective function.
+#./show_wer.sh 3k2
+#%WER 17.87 [ 8793 / 49204, 1061 ins, 2277 del, 5455 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.36 [ 8049 / 49204, 1033 ins, 2148 del, 4868 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 19.7 | 4459 42989 | 82.8 11.8 5.5 2.5 19.7 57.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.4 10.3 5.2 2.2 17.8 54.7 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316
+# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it
+# was previously learning too slow, I think.  I also changed the script
+# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial
+# param-stddev of 0 which will discourage those corresponding input weights in
+# the jesus layer from getting small in early iters; and removed the --normalize-target
+# option and replaced it with the --final-layer-learning-rate-factor option.
+
+#  # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better):
+#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# The following are the corresponding results from 3i, decoded with the same chunk size.
+##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0
+##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is
+# applied, in order to control how fast the final layer's affine component learns.
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12               # blocks guarded by a stage number below this value are skipped
+train_stage=-10        # passed as --stage to steps/nnet3/chain/train_tdnn.sh
+get_egs_stage=-10      # passed as --get-egs-stage to train_tdnn.sh
+speed_perturb=true
+dir=exp/chain/tdnn_3k2  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true        # if true, the decode stage also rescores with the sw1_fsh_fg LM
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh    # explicit path, so a cmd.sh found on $PATH cannot shadow the local one
+. ./path.sh
+. ./utils/parse_options.sh  # presumably applies --<name> <value> overrides (e.g. --stage 8)
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# iVector extraction and feature dumping are done exactly as in the standard
+# nnet3 setup; if those steps have already been run they can be skipped by
+# passing "--stage 8".
+
+# Speed-perturbed data, and everything derived from it, carries an "_sp" suffix.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+
+lang=data/lang_chain_2y
+dir=$dir$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+
+# Speed-perturbed data needs its own alignments, which is why
+# --generate-alignments is given the same value as --speed-perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Produce the training alignments in lattice form (this gives the CTC
+  # training more freedom), with the same number of jobs as the alignments.
+  lats_dir=exp/tri4_lats_nodup$suffix
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 $lats_dir
+  rm $lats_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 10 ]; then
+  # Make a copy of data/lang whose topo file has one state per phone.  [Strictly
+  # speaking it has two states: the first one is only repeated once, the second
+  # one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Overwrite the copied topo with our special topology; it may need tuning
+  # later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the tree on top of the new topology.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # without this the touch below can fail and 'set -e' aborts before training
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --extra-left-context 20 \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=; decode_status=0  # background decoding jobs and their combined exit status
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context-initial 20 \
+         --extra-left-context 50 \
+         --frames-per-chunk 100 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset has always meant "rescore"; make that default explicit
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+for pid in $decode_pids; do wait $pid || decode_status=1; done; wait  # a bare 'wait' would hide failures
+exit $decode_status
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh
new file mode 100755
index 00000000000..608e437659e
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh
@@ -0,0 +1,306 @@
+#!/bin/bash
+
+# [abandoned, not working well.]
+# _3l is as _3j, but making similar changes as in 3i->3k, which is (1) adding
+# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a
+# script change to give the recurrent affine layers an initial param-stddev of
+# 0.  I also changed the script
+# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial
+# param-stddev of 0 which will discourage those corresponding input weights in
+# the jesus layer from getting small in early iters; and removed the --normalize-target
+# option and replaced it with the --final-layer-learning-rate-factor option;
+# and added a learning-rate factor for
+
+# _3j is as _3i but using BlockAffineComponent instead of
+# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false
+# option, which is newly added to the script).
+
+# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is
+# applied, in order to control how fast the final layer's affine component learns.
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain' (defaults; options such as --stage are handled by parse_options.sh below)
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3l  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true  # Fisher LM rescoring after decoding; was tested below but never defined.
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not referenced anywhere else in this script.
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # NOTE(review): presumably exits 0 only for CUDA builds of Kaldi -- confirm
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector extraction and feature dumping below are the same as in the standard
+# nnet3 setup; pass "--stage 8" to skip them if they have already been run.
+
+# Speed-perturbed runs get "_sp" appended to the directory names derived below.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# When the speed-perturbed data is used, alignments must be generated for it,
+# hence --generate-alignments simply follows $speed_perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Produce the alignments in lattice form (gives the CTC training more freedom),
+  # with the same number of jobs as was recorded for the existing alignments.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang and overwrite its topo file with one
+  # that has one state per phone. [It really has two states: the first is only
+  # repeated once, the second has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # This special topology may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones \
+    >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the tree in $treedir using the new topology in $lang and the alignments in $ali_dir.
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+    --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: train the model into $dir, re-using the egs of tdnn_2y_sp
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # The split-dir step above is skipped off the CLSP grid, so $dir/egs may not exist yet;
+  mkdir -p $dir/egs  # without this the 'touch' fails and 'set -e' aborts the run.
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Stage 13: run mkgraph.sh on data/lang_sw1_tg and $dir, writing to $dir/graph_sw1_tg.
+  # Note: the lang directory might appear mismatched, and it is as far as the 'topo' is
+  # concerned, but (per the original note) the 'topo' is not read from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=()  # a bare 'wait' always returns 0, so remember each job's PID and wait on it
+if [ $stage -le 14 ]; then  # Stage 14: decode both test sets in parallel background subshells
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset/empty used to expand to an empty (successful) command
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+          $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+    decode_pids+=($!)
+  done
+fi
+decode_status=0; for pid in "${decode_pids[@]}"; do wait $pid || decode_status=1; done
+exit $decode_status  # non-zero if any decoding/rescoring job failed (previously always 0)
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh
new file mode 100755
index 00000000000..b25f9f15130
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh
@@ -0,0 +1,310 @@
+#!/bin/bash
+
+# [note: this uses BlockAffineComponent not RepeatedAffineComponent]
+# _3m is as _3l, but changing --jesus-stddev-scale from 0.2 to 0.1, as the Jesus layers
+# were learning too slowly in 3l (this will make them learn approximately 4x faster).
+# [terminated, likelihoods were not promising].
+
+# _3l is as _3j, but making similar changes as in 3i->3k, which is (1) adding
+# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a
+# script change to give the recurrent affine layers an initial param-stddev of
+# 0.  I also changed the script
+# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial
+# param-stddev of 0 which will discourage those corresponding input weights in
+# the jesus layer from getting small in early iters; and removed the --normalize-target
+# option and replaced it with the --final-layer-learning-rate-factor option;
+# and added a learning-rate factor for
+
+# _3j is as _3i but using BlockAffineComponent instead of
+# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false
+# option, which is newly added to the script).
+
+# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is
+# applied, in order to control how fast the final layer's affine component learns.
+
+# _3h is as _3g but using a different and hopefully better type of recurrence, using
+# steps/nnet3/make_jesus_configs_recurrent.py to create the configs.  This is more
+# similar to LSTMs.
+# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain' (defaults; options such as --stage are handled by parse_options.sh below)
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3m  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true  # Fisher LM rescoring after decoding; was tested below but never defined.
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not referenced anywhere else in this script.
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # NOTE(review): presumably exits 0 only for CUDA builds of Kaldi -- confirm
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector extraction and feature dumping below are the same as in the standard
+# nnet3 setup; pass "--stage 8" to skip them if they have already been run.
+
+# Speed-perturbed runs get "_sp" appended to the directory names derived below.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# When the speed-perturbed data is used, alignments must be generated for it,
+# hence --generate-alignments simply follows $speed_perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Produce the alignments in lattice form (gives the CTC training more freedom),
+  # with the same number of jobs as was recorded for the existing alignments.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang and overwrite its topo file with one
+  # that has one state per phone. [It really has two states: the first is only
+  # repeated once, the second has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # This special topology may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones \
+    >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the tree in $treedir using the new topology in $lang and the alignments in $ali_dir.
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+    --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: train the model into $dir, re-using the egs of tdnn_2y_sp
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # The split-dir step above is skipped off the CLSP grid, so $dir/egs may not exist yet;
+  mkdir -p $dir/egs  # without this the 'touch' fails and 'set -e' aborts the run.
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.1 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Stage 13: run mkgraph.sh on data/lang_sw1_tg and $dir, writing to $dir/graph_sw1_tg.
+  # Note: the lang directory might appear mismatched, and it is as far as the 'topo' is
+  # concerned, but (per the original note) the 'topo' is not read from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=()  # a bare 'wait' always returns 0, so remember each job's PID and wait on it
+if [ $stage -le 14 ]; then  # Stage 14: decode both test sets in parallel background subshells
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # unset/empty used to expand to an empty (successful) command
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+          $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+    decode_pids+=($!)
+  done
+fi
+decode_status=0; for pid in "${decode_pids[@]}"; do wait $pid || decode_status=1; done
+exit $decode_status  # non-zero if any decoding/rescoring job failed (previously always 0)
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh
new file mode 100755
index 00000000000..dedbd84be75
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh
@@ -0,0 +1,305 @@
+#!/bin/bash
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # abort on the first failing (unchecked) command
+
+# configs for 'chain'
+stage=12           # first stage to run; the '[ $stage -le N ]' guards below skip earlier ones
+train_stage=-10    # forwarded as --stage to steps/nnet3/chain/train_tdnn.sh
+get_egs_stage=-10  # forwarded as --get-egs-stage to steps/nnet3/chain/train_tdnn.sh
+speed_perturb=true
+dir=exp/chain/tdnn_3n  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (like the two lines below): a bare '. cmd.sh' is looked up in $PATH first
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # GPU sanity check: a non-zero status from cuda-compiled stops the run here
+  cat <<EOF && exit 1  # print the explanation below on stdout, then exit with status 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping steps are shared with the standard
+# nnet3 setup; if they have already been run, pass "--stage 8" to skip
+# them.
+
+# Everything derived from speed-perturbed data carries an "_sp" suffix.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir="${dir}${suffix}"
+train_set="train_nodup${suffix}"
+ali_dir="exp/tri4_ali_nodup${suffix}"
+treedir="exp/chain/tri5_2y_tree${suffix}"
+lang=data/lang_chain_2y
+
+# With speed-perturbed data, alignments for it must be generated too, so
+# --generate-alignments simply follows $speed_perturb.
+if ! local/nnet3/run_ivector_common.sh --stage $stage \
+    --speed-perturb $speed_perturb --generate-alignments $speed_perturb; then
+  exit 1
+fi
+
+
+if [ $stage -le 9 ]; then  # Stage 9: produce alignment lattices in exp/tri4_lats_nodup$suffix.
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Use the same number of jobs as the existing alignment dir (read from its num_jobs file).
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # Stage 10: build $lang as a copy of data/lang with a regenerated topo.
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy so the cp below starts clean
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # writes $lang/topo, replacing any copied one
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology (the $lang directory prepared in stage 10).
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: train the network, reusing the egs in exp/chain/tdnn_2y_sp/egs.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p "$dir/egs"  # nothing in this script is guaranteed to have created it; without it the touch fails and 'set -e' aborts
+  touch "$dir/egs/.nodelete" # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # Stage 13: compile the decoding graph into $dir/graph_sw1_tg.
+  # Note: it might appear that this lang directory (data/lang_sw1_tg) is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then  # Stage 14: decode each test set in its own background subshell.
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;  # exits the subshell only
+      if ${has_fisher:-true}; then  # has_fisher is never assigned in this script; default it explicitly (an empty value used to take this branch by accident)
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # barrier for the background decodes; a bare 'wait' returns 0, so their failures do not change the exit status
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh
new file mode 100755
index 00000000000..14383fe1a32
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh
@@ -0,0 +1,309 @@
+#!/bin/bash
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+# [ seemed helpful based on likelihoods on first iterations]: on iter 42,
+# train prob is -0.1554->-0.1523, and valid prob is -0.1559->-0.1540.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # abort on the first failing (unchecked) command
+
+# configs for 'chain'
+stage=12           # first stage to run; the '[ $stage -le N ]' guards below skip earlier ones
+train_stage=-10    # forwarded as --stage to steps/nnet3/chain/train_tdnn.sh
+get_egs_stage=-10  # forwarded as --get-egs-stage to steps/nnet3/chain/train_tdnn.sh
+speed_perturb=true
+dir=exp/chain/tdnn_3o  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (like the two lines below): a bare '. cmd.sh' is looked up in $PATH first
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # GPU sanity check: a non-zero status from cuda-compiled stops the run here
+  cat <<EOF && exit 1  # print the explanation below on stdout, then exit with status 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping steps are shared with the standard
+# nnet3 setup; if they have already been run, pass "--stage 8" to skip
+# them.
+
+# Everything derived from speed-perturbed data carries an "_sp" suffix.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir="${dir}${suffix}"
+train_set="train_nodup${suffix}"
+ali_dir="exp/tri4_ali_nodup${suffix}"
+treedir="exp/chain/tri5_2y_tree${suffix}"
+lang=data/lang_chain_2y
+
+# With speed-perturbed data, alignments for it must be generated too, so
+# --generate-alignments simply follows $speed_perturb.
+if ! local/nnet3/run_ivector_common.sh --stage $stage \
+    --speed-perturb $speed_perturb --generate-alignments $speed_perturb; then
+  exit 1
+fi
+
+
+if [ $stage -le 9 ]; then  # Stage 9: produce alignment lattices in exp/tri4_lats_nodup$suffix.
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # Use the same number of jobs as the existing alignment dir (read from its num_jobs file).
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # Stage 10: build $lang as a copy of data/lang with a regenerated topo.
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy so the cp below starts clean
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # writes $lang/topo, replacing any copied one
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology (the $lang directory prepared in stage 10).
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: train the network, reusing the egs in exp/chain/tdnn_2y_sp/egs.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p "$dir/egs"  # nothing in this script is guaranteed to have created it; without it the touch fails and 'set -e' aborts
+  touch "$dir/egs/.nodelete" # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # Stage 13: compile the decoding graph into $dir/graph_sw1_tg.
+  # Note: it might appear that this lang directory (data/lang_sw1_tg) is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then  # Stage 14: decode each test set in its own background subshell.
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;  # exits the subshell only
+      if ${has_fisher:-true}; then  # has_fisher is never assigned in this script; default it explicitly (an empty value used to take this branch by accident)
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # barrier for the background decodes; a bare 'wait' returns 0, so their failures do not change the exit status
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh
new file mode 100755
index 00000000000..ddba7e7f9c5
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh
@@ -0,0 +1,333 @@
+#!/bin/bash
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# Comparing the WER with 2y, it's about 1% abs worse [see below].  However, this is
+# for an odd reason: the model, while smaller than the 2y one (8.8 vs. 12.1 million
+# parameters), seems to have a lot more learning capacity, with better train and worse valid
+# prob.  In 3r and 3s I am trying smaller versions of this architecture.
+
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#                        2y             3p
+#  final-train-prob:  -0.083068    -0.0771
+#  final-valid-prob:  -0.01212     -0.12715
+# num-parameters:      12094115     8804087
+
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit as soon as a command fails
+
+# configs for 'chain'
+stage=12  # with this default the "-le 9", "-le 10" and "-le 11" stage blocks below are skipped
+train_stage=-10  # forwarded as --stage to steps/nnet3/chain/train_tdnn.sh
+get_egs_stage=-10  # forwarded as --get-egs-stage to train_tdnn.sh
+speed_perturb=true  # "true" makes the code below append _sp to $dir and to the data/alignment names
+dir=exp/chain/tdnn_3p  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # forwarded to build_tree.sh in stage 11
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false  # forwarded as --remove-egs to train_tdnn.sh
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh  # presumably defines $train_cmd and $decode_cmd, which are used below -- confirm
+. ./path.sh
+. ./utils/parse_options.sh  # presumably lets "--name value" arguments override the variables above -- confirm
+
+if ! cuda-compiled; then  # when cuda-compiled returns non-zero, print the message below and exit 1
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is enabled
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # directory handed to train_tdnn.sh, mkgraph.sh and decode.sh below
+train_set=train_nodup$suffix  # used as data/$train_set and data/${train_set}_hires
+ali_dir=exp/tri4_ali_nodup$suffix  # alignment directory given to build_tree.sh (stage 11)
+treedir=exp/chain/tri5_2y_tree$suffix  # last argument of build_tree.sh (stage 11); also passed to train_tdnn.sh (stage 12)
+lang=data/lang_chain_2y  # copy of data/lang whose topo file is regenerated in stage 10
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;  # invoked whatever $stage is; the stage value is only forwarded
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # any existing $lang directory is deleted before the fresh copy
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # replaces the topo copied from data/lang
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs  # $dir/egs may not exist (the storage split above is CLSP-only); touch would then fail under set -e
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+ # Train the network; --egs-dir points at the egs directory of the tdnn_2y_sp run rather than $dir/egs.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; "exit 1" inside only ends that subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not assigned in this script; unset/empty still rescores, as before
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # block until the background decodes finish; NOTE: a bare 'wait' returns 0, so their failures do not reach the exit status
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh
new file mode 100755
index 00000000000..9f67164b806
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh
@@ -0,0 +1,315 @@
+#!/bin/bash
+
+# _3q is as _3p, but now trying out the 'block' training script, where in addition to
+# the affine connections we have block-matrix connections between the layers.
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit as soon as a command fails
+
+# configs for 'chain'
+stage=12  # with this default the "-le 9", "-le 10" and "-le 11" stage blocks below are skipped
+train_stage=-10  # forwarded as --stage to steps/nnet3/chain/train_tdnn.sh
+get_egs_stage=-10  # forwarded as --get-egs-stage to train_tdnn.sh
+speed_perturb=true  # "true" makes the code below append _sp to $dir and to the data/alignment names
+dir=exp/chain/tdnn_3q  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # forwarded to build_tree.sh in stage 11
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false  # forwarded as --remove-egs to train_tdnn.sh
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh  # presumably defines $train_cmd and $decode_cmd, which are used below -- confirm
+. ./path.sh
+. ./utils/parse_options.sh  # presumably lets "--name value" arguments override the variables above -- confirm
+
+if ! cuda-compiled; then  # when cuda-compiled returns non-zero, print the message below and exit 1
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is enabled
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # directory handed to train_tdnn.sh, mkgraph.sh and decode.sh below
+train_set=train_nodup$suffix  # used as data/$train_set and data/${train_set}_hires
+ali_dir=exp/tri4_ali_nodup$suffix  # alignment directory given to build_tree.sh (stage 11)
+treedir=exp/chain/tri5_2y_tree$suffix  # last argument of build_tree.sh (stage 11); also passed to train_tdnn.sh (stage 12)
+lang=data/lang_chain_2y  # copy of data/lang whose topo file is regenerated in stage 10
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;  # invoked whatever $stage is; the stage value is only forwarded
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # any existing $lang directory is deleted before the fresh copy
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # replaces the topo copied from data/lang
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs  # $dir/egs may not exist (the storage split above is CLSP-only); touch would then fail under set -e
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+ # NOTE(review): the file header mentions a 'block' training script, but train_tdnn.sh is called here with --jesus-block-opts -- confirm.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-block-opts "--jesus-full-output-dim 900 --jesus-full-input-dim 900 --jesus-block-input-dim 900 --jesus-block-output-dim 900  --jesus-hidden-dim 15000 --jesus-final-output-dim 600 --jesus-stddev-scale 0.4 --num-affine-blocks 25 --final-layer-target-rms 0.5" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,0,3 -6,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; "exit 1" inside only ends that subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not assigned in this script; unset/empty still rescores, as before
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # block until the background decodes finish; NOTE: a bare 'wait' returns 0, so their failures do not reach the exit status
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh
new file mode 100755
index 00000000000..7815adffb9f
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh
@@ -0,0 +1,321 @@
+#!/bin/bash
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+# [I think I abandoned this after deciding to reduce the parameters even further,
+# to the setup in 3s].
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # abort on the first failing command
+
+# configs for 'chain' (defaults; parse_options.sh below is sourced after them so they can be overridden)
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3r  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true  # stage 14 tests this to decide on sw1_fsh_fg rescoring; it was used there but never defined
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (as for path.sh) so a cmd.sh found via $PATH is never sourced instead
+. ./path.sh
+. ./utils/parse_options.sh
+
+cuda-compiled || {  # refuse to continue unless cuda-compiled reports success
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+}
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Speed-perturbed data, and every directory derived from it, carries "_sp".
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# Run the shared nnet3 preparation script.  Alignments are requested from it
+# only when the speed-perturbed data is in use (the same flag drives both options).
+local/nnet3/run_ivector_common.sh \
+  --stage ${stage} --speed-perturb ${speed_perturb} \
+  --generate-alignments ${speed_perturb} || exit 1
+
+
+if [ ${stage} -le 9 ]; then
+  # Align the training data as lattices (per the original note, this gives the
+  # CTC training more freedom), with as many jobs as the alignment dir used.
+  nj=$(cat ${ali_dir}/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" \
+    data/${train_set} data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz  # save space
+fi
+
+
+if [ ${stage} -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite its topo file with the
+  # chain topology.  [Original note: nominally one state per phone, really two;
+  # the first is only repeated once, the second has zero or more repeats.]
+  rm -rf ${lang}
+  cp -r data/lang ${lang}
+  sil_phones=$(cat ${lang}/phones/silence.csl) || exit 1
+  nonsil_phones=$(cat ${lang}/phones/nonsilence.csl) || exit 1
+  # The non-silence list is passed first, then the silence list.  This special
+  # topology may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py ${nonsil_phones} ${sil_phones} >${lang}/topo
+fi
+
+if [ ${stage} -le 11 ]; then
+  # Build the tree against the new topology; output goes to $treedir.
+  steps/nnet3/chain/build_tree.sh --cmd "$train_cmd" \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate ${leftmost_questions_truncate} \
+      9000 data/${train_set} ${lang} ${ali_dir} ${treedir}
+fi
+
+if [ $stage -le 12 ]; then  # stage 12: network training
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs  # the split-dir step above is skipped off the CLSP grid; without this, 'touch' aborts the script under 'set -e'
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+ # Train the model on the egs already dumped for the 2y system (see --egs-dir).
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 500  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ ${stage} -le 13 ]; then
+  # Compile the decoding graph.  The 'topo' in data/lang_sw1_tg looks mismatched
+  # with this model, but according to the original note that does not matter
+  # because the 'topo' is not read from the lang directory here.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg ${dir} ${dir}/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs launched below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; '|| exit 1' in here only ends that subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # an unset has_fisher already took this branch; the default makes that explicit
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || { echo "$0: decoding job $pid failed" >&2; rc=1; }; done  # a bare 'wait' always returns 0
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh
new file mode 100755
index 00000000000..6cee8b11925
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh
@@ -0,0 +1,340 @@
+#!/bin/bash
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # abort on the first failing command
+
+# configs for 'chain' (defaults; parse_options.sh below is sourced after them so they can be overridden)
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3s  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true  # stage 14 tests this to decide on sw1_fsh_fg rescoring; it was used there but never defined
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ (as for path.sh) so a cmd.sh found via $PATH is never sourced instead
+. ./path.sh
+. ./utils/parse_options.sh
+
+cuda-compiled || {  # refuse to continue unless cuda-compiled reports success
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+}
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Speed-perturbed data, and every directory derived from it, carries "_sp".
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# Run the shared nnet3 preparation script.  Alignments are requested from it
+# only when the speed-perturbed data is in use (the same flag drives both options).
+local/nnet3/run_ivector_common.sh \
+  --stage ${stage} --speed-perturb ${speed_perturb} \
+  --generate-alignments ${speed_perturb} || exit 1
+
+
+if [ ${stage} -le 9 ]; then
+  # Align the training data as lattices (per the original note, this gives the
+  # CTC training more freedom), with as many jobs as the alignment dir used.
+  nj=$(cat ${ali_dir}/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" \
+    data/${train_set} data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz  # save space
+fi
+
+
+if [ ${stage} -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite its topo file with the
+  # chain topology.  [Original note: nominally one state per phone, really two;
+  # the first is only repeated once, the second has zero or more repeats.]
+  rm -rf ${lang}
+  cp -r data/lang ${lang}
+  sil_phones=$(cat ${lang}/phones/silence.csl) || exit 1
+  nonsil_phones=$(cat ${lang}/phones/nonsilence.csl) || exit 1
+  # The non-silence list is passed first, then the silence list.  This special
+  # topology may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py ${nonsil_phones} ${sil_phones} >${lang}/topo
+fi
+
+if [ ${stage} -le 11 ]; then
+  # Build the tree against the new topology; output goes to $treedir.
+  steps/nnet3/chain/build_tree.sh --cmd "$train_cmd" \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate ${leftmost_questions_truncate} \
+      9000 data/${train_set} ${lang} ${ali_dir} ${treedir}
+fi
+
+if [ $stage -le 12 ]; then  # stage 12: network training
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs  # the split-dir step above is skipped off the CLSP grid; without this, 'touch' aborts the script under 'set -e'
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+ # Train the model on the egs already dumped for the 2y system (see --egs-dir).
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ ${stage} -le 13 ]; then
+  # Compile the decoding graph.  The 'topo' in data/lang_sw1_tg looks mismatched
+  # with this model, but according to the original note that does not matter
+  # because the 'topo' is not read from the lang directory here.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg ${dir} ${dir}/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs launched below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; '|| exit 1' in here only ends that subshell
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # an unset has_fisher already took this branch; the default makes that explicit
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || { echo "$0: decoding job $pid failed" >&2; rc=1; }; done  # a bare 'wait' always returns 0
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh
new file mode 100755
index 00000000000..25e30900e36
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh
@@ -0,0 +1,336 @@
+#!/bin/bash
+
+# _3t is as _3s but using slightly wider context.  Dumping our own egs.
+#  The final train prob is better -0.0851->-0.0815, but valid prob is worse -0.1231->-0.1243.
+# WER is slightly worse.  So we won't use this for now, but later if we use more data we
+# could try wider context like this.
+#a03:s5c: ./show_wer.sh 3s
+#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#
+#%WER 18.01 [ 8860 / 49204, 1043 ins, 2315 del, 5502 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.68 [ 8205 / 49204, 930 ins, 2420 del, 4855 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 19.7 | 4459 42989 | 82.6 11.9 5.5 2.3 19.7 57.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.2 10.4 5.4 2.0 17.8 55.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit as soon as an unguarded command fails
+
+# configs for 'chain'
+stage=12  # blocks guarded by "$stage -le N" run only when N is >= this value
+train_stage=-10  # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true  # selects the _sp data, alignment and experiment directories
+dir=exp/chain/tdnn_3t  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # passed to build_tree.sh; -1 disables the mechanism (see _2m note)
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false  # forwarded to train_tdnn.sh as --remove-egs
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so the recipe's own cmd.sh is sourced, not one found via PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # refuse to continue when the cuda-compiled check fails
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is enabled
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # experiment directory; exp/chain/tdnn_3t_sp with the defaults above
+train_set=train_nodup$suffix  # used below as data/$train_set and data/${train_set}_hires
+ali_dir=exp/tri4_ali_nodup$suffix  # alignments handed to build_tree.sh
+treedir=exp/chain/tri5_2y_tree$suffix  # written by build_tree.sh, read by train_tdnn.sh
+lang=data/lang_chain_2y  # copy of data/lang that receives the generated topo
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;  # always invoked; $stage is forwarded to it
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix  # this directory is later given to train_tdnn.sh
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # the directory is rebuilt from scratch each time this stage runs
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # overwrites the topo copied from data/lang
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir  # 9000: presumably the target number of tree leaves (cf. the _q note) - confirm
+fi
+
+if [ $stage -le 12 ]; then  # train the chain model with train_tdnn.sh
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then  # CLSP hosts only
+    utils/create_split_dir.pl \
+     /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # on non-CLSP hosts nothing above creates it, and touch would abort under set -e
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, checked after the loop
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not assigned in this script; unset still means "rescore"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+for pid in $decode_pids; do wait $pid || exit 1; done  # a bare 'wait' would hide failed jobs
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh
new file mode 100755
index 00000000000..d1b93d9084c
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh
@@ -0,0 +1,330 @@
+#!/bin/bash
+
+# _3u is as _3s (and re-using the egs) but with one more layer; keeping the same dim
+# and total context, and reducing --jesus-forward-output-dim from 1500 to 1300 to
+# ensure that the number of parameters doesn't increase too much.
+#  [stopping this run, as the likelihoods weren't promising, e.g. by iteration
+#  39, the valid-prob was worse vs. 3t, -0.1488 -> -0.1521 (train: -0.1510 -> -0.1532).]
+
+# _3t is as _3s but using slightly wider context.  Dumping our own egs.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit as soon as an unguarded command fails
+
+# configs for 'chain'
+stage=12  # blocks guarded by "$stage -le N" run only when N is >= this value
+train_stage=-10  # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true  # selects the _sp data, alignment and experiment directories
+dir=exp/chain/tdnn_3u  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # passed to build_tree.sh; -1 disables the mechanism (see _2m note)
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false  # forwarded to train_tdnn.sh as --remove-egs
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so the recipe's own cmd.sh is sourced, not one found via PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # refuse to continue when the cuda-compiled check fails
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is enabled
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # experiment directory; exp/chain/tdnn_3u_sp with the defaults above
+train_set=train_nodup$suffix  # used below as data/$train_set and data/${train_set}_hires
+ali_dir=exp/tri4_ali_nodup$suffix  # alignments handed to build_tree.sh
+treedir=exp/chain/tri5_2y_tree$suffix  # written by build_tree.sh, read by train_tdnn.sh
+lang=data/lang_chain_2y  # copy of data/lang that receives the generated topo
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;  # always invoked; $stage is forwarded to it
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix  # this directory is later given to train_tdnn.sh
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # the directory is rebuilt from scratch each time this stage runs
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # overwrites the topo copied from data/lang
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir  # 9000: presumably the target number of tree leaves (cf. the _q note) - confirm
+fi
+
+if [ $stage -le 12 ]; then  # train the chain model with train_tdnn.sh, re-using the 3t egs
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then  # CLSP hosts only
+    utils/create_split_dir.pl \
+     /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # egs come from --egs-dir below, so $dir/egs may not exist and touch would abort under set -e
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_3t_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, checked after the loop
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not assigned in this script; unset still means "rescore"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+for pid in $decode_pids; do wait $pid || exit 1; done  # a bare 'wait' would hide failed jobs
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh
new file mode 100755
index 00000000000..c7fcb7e24f5
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh
@@ -0,0 +1,328 @@
+#!/bin/bash
+
+# _3v is as _3t but decreasing the --num-jesus-blocks from 100 to 50.
+# I stopped it early after likelihoods were not promising:
+#  on iter 90, train prob was -0.1226->-0.1240, valid -0.1304->-0.1340.
+
+# _3t is as _3s but using slightly wider context.  Dumping our own egs.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3v  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true  # if true, the decoding stage also rescores with data/lang_sw1_fsh_fg.
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so that the local file is sourced, as for path.sh below.
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # bail out early; the message below explains why.
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+case "$speed_perturb" in
+  true) suffix=_sp ;;  # speed-perturbed data and experiment dirs carry "_sp".
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# Alignments are only generated here when speed perturbation is on, because
+# the speed-perturbed data needs alignments of its own.
+local/nnet3/run_ivector_common.sh --stage "$stage" \
+  --speed-perturb "$speed_perturb" \
+  --generate-alignments "$speed_perturb" || exit 1
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm -f exp/tri4_lats_nodup$suffix/fsts.*.gz # save space; -f so 'set -e' doesn't abort if none exist.
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang and overwrite its 'topo' file with
+  # a one-state-per-phone topology. [it really has two states: the first one
+  # is only repeated once, the second one has zero or more repeats.]
+  rm -rf "$lang"
+  cp -r data/lang "$lang"
+  silphonelist=$(cat "$lang/phones/silence.csl") || exit 1
+  nonsilphonelist=$(cat "$lang/phones/nonsilence.csl") || exit 1
+  # This is our special topology; it may have to be tuned later on.
+  # Note the argument order: non-silence phones first, then silence phones.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >"$lang/topo"
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Build the decision tree in $treedir using the new topology from $lang.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate "$leftmost_questions_truncate" \
+      --cmd "$train_cmd" 9000 "data/$train_set" "$lang" "$ali_dir" "$treedir"
+fi
+
+if [ $stage -le 12 ]; then  # train the network, re-using the egs dumped by tdnn_3t_sp.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs  # only the CLSP branch above creates it; without this, touch fails under 'set -e'.
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_3t_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --num-jesus-blocks 50 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # Compile the decoding graph.  The lang directory passed here has a 'topo'
+  # that is mismatched with the one used in training; that is harmless because
+  # this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg "$dir" "$dir/graph_sw1_tg"
+fi
+
+decode_suff=sw1_tg; decode_pids=  # decode_pids: PIDs of the background decoding jobs.
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then  # decode both test sets in parallel.
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; its PID is recorded below.
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescore unless has_fisher is explicitly false.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # wait for all; remember any failure.
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh
new file mode 100755
index 00000000000..e4165e54de6
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh
@@ -0,0 +1,332 @@
+#!/bin/bash
+
+# _3w is as _3t but instead of having a rectangular affine component in each
+# layer, making it square (700->600 not 1300->400), and introducing a new script
+# option --final-hidden-dim to have something like a bottleneck at the last
+# layer, to avoid a blowup in parameters.
+#  (note: num-params was slightly smaller, 4.8 million vs 5.3 million.)
+#  I stopped this on iter 65 after likelihoods were not promising:
+# on iter 63, train -0.133->-0.138, valid -0.138->-0.141.
+
+# _3t is as _3s but using slightly wider context.  Dumping our own egs.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_3w  # Note: _sp will get added to this if $speed_perturb == true.
+has_fisher=true  # if true, the decoding stage also rescores with data/lang_sw1_fsh_fg.
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so that the local file is sourced, as for path.sh below.
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # bail out early; the message below explains why.
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+case "$speed_perturb" in
+  true) suffix=_sp ;;  # speed-perturbed data and experiment dirs carry "_sp".
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# Alignments are only generated here when speed perturbation is on, because
+# the speed-perturbed data needs alignments of its own.
+local/nnet3/run_ivector_common.sh --stage "$stage" \
+  --speed-perturb "$speed_perturb" \
+  --generate-alignments "$speed_perturb" || exit 1
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm -f exp/tri4_lats_nodup$suffix/fsts.*.gz # save space; -f so 'set -e' doesn't abort if none exist.
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang and overwrite its 'topo' file with
+  # a one-state-per-phone topology. [it really has two states: the first one
+  # is only repeated once, the second one has zero or more repeats.]
+  rm -rf "$lang"
+  cp -r data/lang "$lang"
+  silphonelist=$(cat "$lang/phones/silence.csl") || exit 1
+  nonsilphonelist=$(cat "$lang/phones/nonsilence.csl") || exit 1
+  # This is our special topology; it may have to be tuned later on.
+  # Note the argument order: non-silence phones first, then silence phones.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >"$lang/topo"
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Build the decision tree in $treedir using the new topology from $lang.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate "$leftmost_questions_truncate" \
+      --cmd "$train_cmd" 9000 "data/$train_set" "$lang" "$ali_dir" "$treedir"
+fi
+
+if [ $stage -le 12 ]; then  # train the network, re-using the egs dumped by tdnn_3t_sp.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs  # only the CLSP branch above creates it; without this, touch fails under 'set -e'.
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_3t_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 800 --final-hidden-dim 400 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # Compile the decoding graph.  The lang directory passed here has a 'topo'
+  # that is mismatched with the one used in training; that is harmless because
+  # this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg "$dir" "$dir/graph_sw1_tg"
+fi
+
+decode_suff=sw1_tg; decode_pids=  # decode_pids: PIDs of the background decoding jobs.
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then  # decode both test sets in parallel.
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; its PID is recorded below.
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescore unless has_fisher is explicitly false.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # wait for all; remember any failure.
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh
new file mode 100755
index 00000000000..1585d209a93
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh
@@ -0,0 +1,341 @@
+#!/bin/bash
+
+# _3x is as _3s (and continuing the same kind of experimentation as in 3t->3w)...
+#  increasing --jesus-forward-output-dim from 1500 to 2000.
+# More overtraining: final-train -0.0852->-0.0799, final-valid -0.1231->-0.1261,
+# WER effect is very tiny but maybe slightly better.
+#a03:s5c: ./show_wer.sh 3x
+#%WER 17.78 [ 8750 / 49204, 910 ins, 2405 del, 5435 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_tg/wer_12_0.0
+#%WER 16.60 [ 8166 / 49204, 921 ins, 2290 del, 4955 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 19.5 | 4459 42989 | 82.7 11.4 5.9 2.2 19.5 57.5 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.7 | 4459 42989 | 84.3 10.3 5.5 1.9 17.7 54.6 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 3s
+#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+
+
+# _3t is as _3s but using slightly wider context.  Dumping our own egs.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails
+
+# configs for 'chain'
+stage=12  # with this default the "-le 9", "-le 10" and "-le 11" stages below are skipped
+train_stage=-10  # passed to train_tdnn.sh as --stage
+get_egs_stage=-10  # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3x  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options: forwarded to train_tdnn.sh, except leftmost_questions_truncate (build_tree.sh)
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh  # NOTE(review): presumably applies --option overrides to the variables above; confirm
+# Print the message below and exit 1 when the cuda-compiled check fails.
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is on
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+# Names derived from the suffix (lang is the one that does not use it).
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix  # path carries the 2y name, not 3x
+lang=data/lang_chain_2y  # path carries the 2y name, not 3x
+
+# This call is not stage-guarded here; $stage is simply forwarded to the helper.
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then  # Stage 9: write lattices to exp/tri4_lats_nodup$suffix (used by stage 12).
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments (read from the alignment dir's num_jobs file)
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # Stage 10: rebuild $lang from data/lang with a new topo.
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.  The redirect below overwrites the topo copied from data/lang.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then  # Stage 11: presumably writes the tree to $treedir (last argument) - confirm.
+  # Build a tree using our new topology ($lang) and the alignments in $ali_dir.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: train the chain model into $dir.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # Only the CLSP branch above touches $dir/egs, so create it before adding the marker.
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Training reads existing egs from --egs-dir and the lattices written by stage 9.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_3t_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 2000 --final-hidden-dim 350 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # Stage 13: build the decoding graph in $dir/graph_sw1_tg.
+  # Note: it might appear that the lang directory used here (data/lang_sw1_tg) is
+  # mismatched, and it is as far as the 'topo' is concerned, but this script
+  # doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+# Stage 14: decode both test sets in parallel; a failed job now fails the script.
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=; decode_failed=0  # background job PIDs / becomes 1 if any job fails
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # no default is assigned in this script; unset still means rescore
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+for pid in $decode_pids; do wait $pid || decode_failed=1; done  # a bare 'wait' discards job failures
+exit $decode_failed;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh
new file mode 100755
index 00000000000..042ec84898b
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# _3y is as _3s but doubling jesus-hidden-dim from 15000 to 30000.
+#  not promising: by iteration 228, train prob changed -0.09583->-0.09575, and
+# valid prob from -0.1213 -> -0.1239.  Killed it.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y.
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails
+
+# configs for 'chain'
+stage=12  # with this default the "-le 9", "-le 10" and "-le 11" stages below are skipped
+train_stage=-10  # passed to train_tdnn.sh as --stage
+get_egs_stage=-10  # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_3y  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options: forwarded to train_tdnn.sh, except leftmost_questions_truncate (build_tree.sh)
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh  # NOTE(review): presumably applies --option overrides to the variables above; confirm
+# Print the message below and exit 1 when the cuda-compiled check fails.
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is on
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+# Names derived from the suffix (lang is the one that does not use it).
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix  # path carries the 2y name, not 3y
+lang=data/lang_chain_2y  # path carries the 2y name, not 3y
+
+# This call is not stage-guarded here; $stage is simply forwarded to the helper.
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then  # Stage 9: write lattices to exp/tri4_lats_nodup$suffix (used by stage 12).
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments (read from the alignment dir's num_jobs file)
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # Stage 10: rebuild $lang from data/lang with a new topo.
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.  The redirect below overwrites the topo copied from data/lang.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then  # Stage 11: presumably writes the tree to $treedir (last argument) - confirm.
+  # Build a tree using our new topology ($lang) and the alignments in $ali_dir.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # Stage 12: train the chain model into $dir.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # Only the CLSP branch above touches $dir/egs, so create it before adding the marker.
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Training reads existing egs from --egs-dir and the lattices written by stage 9.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 30000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # Stage 13: build the decoding graph in $dir/graph_sw1_tg.
+  # Note: it might appear that the lang directory used here (data/lang_sw1_tg) is
+  # mismatched, and it is as far as the 'topo' is concerned, but this script
+  # doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+# Stage 14: decode both test sets in parallel; a failed job now fails the script.
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=; decode_failed=0  # background job PIDs / becomes 1 if any job fails
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # no default is assigned in this script; unset still means rescore
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+for pid in $decode_pids; do wait $pid || decode_failed=1; done  # a bare 'wait' discards job failures
+exit $decode_failed;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh
new file mode 100755
index 00000000000..f1fa2c5a45e
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh
@@ -0,0 +1,350 @@
+#!/bin/bash
+
+# _3z is as _3s, but reducing the target num-states in the tree building from 9k to 6k.
+# A slight degradation in WER, but it's not 100% consistent.  The final train-prob
+# was worse -0.0852 -> -0.0888, and valid-prob was worse -0.1231->-0.1280.
+#./show_wer.sh 3z
+#%WER 18.05 [ 8883 / 49204, 990 ins, 2397 del, 5496 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.50 [ 8120 / 49204, 960 ins, 2234 del, 4926 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 19.7 | 4459 42989 | 82.5 11.9 5.5 2.2 19.7 57.6 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 1.9 17.8 55.1 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails
+
+# configs for 'chain'
+stage=11  # numbered blocks below this value are skipped (see the "$stage -le N" tests)
+train_stage=-10  # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true  # "true" selects the _sp (speed-perturbed) data and directories
+dir=exp/chain/tdnn_3z  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options (for train_tdnn.sh; leftmost_questions_truncate is for build_tree.sh)
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh  # NOTE(review): presumably defines $train_cmd and $decode_cmd used below -- confirm
+. ./path.sh
+. ./utils/parse_options.sh  # NOTE(review): presumably lets --name value flags override the variables above -- confirm
+
+if ! cuda-compiled; then  # refuse to start when Kaldi was built without CUDA (message below)
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Speed-perturbed runs keep their data and models in "_sp"-tagged directories.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir="${dir}${suffix}"
+train_set="train_nodup${suffix}"
+ali_dir="exp/tri4_ali_nodup${suffix}"
+treedir="exp/chain/tri5_3z_tree${suffix}"
+lang=data/lang_chain_2y
+
+
+# Run the common nnet3 preparation script, handing it our --stage.  If we are using
+# the speed-perturbed data we need to generate alignments for it as well.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Align the training data as lattices (gives the CTC training more freedom),
+  # running with the same number of jobs as the existing alignment directory.
+  nj=$(<"$ali_dir/num_jobs") || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz  # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Start from a fresh copy of data/lang and give it a topo with one state per
+  # phone. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(<"$lang/phones/silence.csl") || exit 1
+  nonsilphonelist=$(<"$lang/phones/nonsilence.csl") || exit 1
+  # Overwrite the copied topo with our special topology (may need tuning later).
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist \
+    >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.  6000 is the target num-states (the _3s -> _3z change: 9k -> 6k).
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 6000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi  # (only on *.clsp.jhu.edu hosts: set up $dir/egs/storage via create_split_dir.pl)
+  mkdir -p "$dir/egs"  # may not exist yet when the step above was skipped; touch would abort the run
+  touch "$dir/egs/.nodelete" # keep egs around when that run dies.
+ # Train the chain TDNN (jesus-layer options) from the hires data, the tree and the stage-9 lattices.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --final-layer-normalize-target $final_layer_normalize_target \
+    --relu-dim 850 \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Compile the decoding graph for the sw1_tg LM.  Note: this lang directory may
+  # look mismatched, and it is as far as the 'topo' is concerned, but this
+  # script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, so their exit codes can be checked
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      # has_fisher is not assigned in this script; unset/empty counts as true, as the bare expansion did.
+      if ${has_fisher:-true}; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_sw1_{tg,fsh_fg} \
+            data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+# A bare `wait` always returns 0, so reap each decode explicitly and report any failure.
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done; wait
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh
new file mode 100755
index 00000000000..c02ad2cb0e4
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh
@@ -0,0 +1,349 @@
+#!/bin/bash
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+# WER is maybe a fraction worse than 3s (see below); final train prob is
+# worse -0.0852 -> -0.0879, and valid prob is better -0.1231 -> -0.1213
+#./show_wer.sh 4a
+#%WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+#%WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#%WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # stop at the first command that fails
+
+# configs for 'chain'
+stage=12  # numbered blocks below this value are skipped (see the "$stage -le N" tests)
+train_stage=-10  # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # value for train_tdnn.sh's --get-egs-stage, as in the sibling chain recipes
+speed_perturb=true  # "true" selects the _sp (speed-perturbed) data and directories
+dir=exp/chain/tdnn_4a  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options (for train_tdnn.sh; leftmost_questions_truncate is for build_tree.sh)
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh  # NOTE(review): presumably defines $train_cmd and $decode_cmd used below -- confirm
+. ./path.sh
+. ./utils/parse_options.sh  # NOTE(review): presumably lets --name value flags override the variables above -- confirm
+
+if ! cuda-compiled; then  # refuse to start when Kaldi was built without CUDA (message below)
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Speed-perturbed runs keep their data and models in "_sp"-tagged directories.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir="${dir}${suffix}"
+train_set="train_nodup${suffix}"
+ali_dir="exp/tri4_ali_nodup${suffix}"
+treedir="exp/chain/tri5_2y_tree${suffix}"
+lang=data/lang_chain_2y
+
+
+# Run the common nnet3 preparation script, handing it our --stage.  If we are using
+# the speed-perturbed data we need to generate alignments for it as well.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Redo the alignment but keep the result as lattices (this gives the training
+  # more freedom); the number of jobs is copied from the existing alignments.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang and overwrite its topo file.  The
+  # topology is described as one state per phone [it really has two states: the
+  # first one is only repeated once, the second has zero or more repeats].
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # gen_topo.py prints our special topology on stdout; note that later on we
+  # may have to tune this topology.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then  # not run at the default stage=12
+  # Build a tree in $treedir using our new topology (read from $lang); 9000 is presumably the target number of leaves -- confirm in build_tree.sh.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # train the network, re-using the egs dumped for tdnn_2y_sp
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # touch does not create parent directories, and outside the CLSP branch above nothing has made $dir/egs yet.
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # build the decoding graph in $dir/graph_sw1_tg
+  # Note: the lang directory passed here (data/lang_sw1_tg) might appear to be
+  # mismatched, and it is as far as the 'topo' is concerned, but per the original
+  # note mkgraph.sh doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding subshells started below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' returns 0 even if a decode job failed
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh
new file mode 100755
index 00000000000..aad278c3037
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# _4b is as _4a, but even narrower splice-indexes in 1st layer (no splicing)
+#  stopped early after train and valid likelihoods were not promising.
+# [later accidentally overwrote and moved the dir.]
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12  # first stage to run; the stage 9-11 blocks below are skipped at this default
+train_stage=-10  # passed to train_tdnn.sh as --stage
+get_egs_stage=-10  # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_4b  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not referenced again in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=true  # tested by the decoding stage ('if $has_fisher') before rescoring with data/lang_sw1_fsh_fg
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so the file is taken from the current directory, not found via $PATH
+. ./path.sh
+. ./utils/parse_options.sh
+
+cuda-compiled || {  # stop here when Kaldi was built without CUDA support
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+}
+
+# iVector extraction and feature dumping are the same as in the standard nnet3
+# setup (done by the run_ivector_common.sh call below); if you have already run
+# those things you can skip them by setting "--stage 8".
+
+# Every directory name below gets an _sp suffix when speed perturbation is on.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# With speed-perturbed data we also need alignments for it, which is why
+# --generate-alignments simply follows $speed_perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Redo the alignment but keep the result as lattices (this gives the training
+  # more freedom); the number of jobs is copied from the existing alignments.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang and overwrite its topo file.  The
+  # topology is described as one state per phone [it really has two states: the
+  # first one is only repeated once, the second has zero or more repeats].
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # gen_topo.py prints our special topology on stdout; note that later on we
+  # may have to tune this topology.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then  # not run at the default stage=12
+  # Build a tree in $treedir using our new topology (read from $lang); 9000 is presumably the target number of leaves -- confirm in build_tree.sh.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # train the network, re-using the egs dumped for tdnn_2y_sp
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # touch does not create parent directories, and outside the CLSP branch above nothing has made $dir/egs yet.
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # build the decoding graph in $dir/graph_sw1_tg
+  # Note: the lang directory passed here (data/lang_sw1_tg) might appear to be
+  # mismatched, and it is as far as the 'topo' is concerned, but per the original
+  # note mkgraph.sh doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding subshells started below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' returns 0 even if a decode job failed
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh
new file mode 100755
index 00000000000..d9060251844
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh
@@ -0,0 +1,357 @@
+#!/bin/bash
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+# Yay-- WER is slightly better or the same.  Final train-prob is worse
+# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241.
+
+# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4a
+# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4c # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # train the model with steps/nnet3/chain/train_tdnn.sh
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # make sure the dir exists (touch would abort under set -e); keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script's config; default to rescoring when it is unset
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &  # one background subshell per decode set
+  done
+fi
+wait;  # block until all background decodes have finished
+exit 0;  # NOTE: a bare 'wait' does not propagate failures from the background subshells
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh
new file mode 100755
index 00000000000..1ae220dc21a
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh
@@ -0,0 +1,346 @@
+#!/bin/bash
+
+# _4d is as _4a, but with --egs-opts "--frames-overlap-per-eg 10
+# --cut-zero-frames 5" and changing apply-deriv-weights to true... this to
+# activate the new-style derivative weights.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4d  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # train the model with steps/nnet3/chain/train_tdnn.sh
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # make sure the dir exists (touch would abort under set -e); keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights true \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 10 --cut-zero-frames 5" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script's config; default to rescoring when it is unset
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &  # one background subshell per decode set
+  done
+fi
+wait;  # block until all background decodes have finished
+exit 0;  # NOTE: a bare 'wait' does not propagate failures from the background subshells
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh
new file mode 100755
index 00000000000..fea5495ee06
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh
@@ -0,0 +1,362 @@
+#!/bin/bash
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+# big improvement- about 0.7% WER abs.  Considering the non-l2 part of the objf, the
+# final valid objf c->e is -0.1241->-0.1266 [and the l2 term is -0.0196].
+# and for the training set it's -0.08820 -> -0.1149.
+
+
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4c
+# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4e # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so that '.' cannot pick up some other cmd.sh via $PATH.
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ # $dir/egs may not exist yet when the branch above is skipped, so create it before touching the marker.
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --l2-regularize 0.0001 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg; graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, so that their exit status can be checked below.
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set; the '|| exit 1's below only leave this subshell.
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; default it to true explicitly.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # a bare 'wait' would discard the jobs' failures.
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh
new file mode 100755
index 00000000000..36d5f188c56
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh
@@ -0,0 +1,366 @@
+#!/bin/bash
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12          # stages guarded by a smaller number in "[ $stage -le N ]" are skipped
+train_stage=-10   # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10 # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_4f # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=true  # tested before the lang_sw1_fsh_fg rescoring; explicit default instead of an unset variable
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh   # explicit ./ (as for path.sh) so a cmd.sh found via $PATH is never sourced instead
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Every derived name below gains an "_sp" tag when speed perturbation is on.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# --generate-alignments mirrors --speed-perturb: the speed-perturbed data
+# needs alignments of its own.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Dump the alignments as lattices (gives the CTC training more freedom),
+  # using as many jobs as the existing alignment directory was made with.
+  nj=$(< $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --cmd "$train_cmd" --nj $nj \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite only its 'topo' with a
+  # one-state-per-phone topology. [It really has two states: the first one is
+  # only repeated once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  nonsilphonelist=$(< $lang/phones/nonsilence.csl) || exit 1;
+  silphonelist=$(< $lang/phones/silence.csl) || exit 1;
+  # gen_topo.py is given the non-silence list first, then the silence list.
+  # This special topology may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Grow the tree on the new topology in $lang (9000 is presumably the leaf target).
+  steps/nnet3/chain/build_tree.sh --cmd "$train_cmd" \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # only the CLSP branch above may create it; a failing touch would abort under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # The lang directory used here is mismatched as far as its 'topo' goes, but
+  # that does not matter: this script doesn't read the 'topo' from it.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  # Decode both test sets concurrently, one background subshell per set.
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescore unless has_fisher is explicitly set to false
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # barrier for the background decodes; their exit codes do not affect this script's status
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh
new file mode 100755
index 00000000000..430c6c28c70
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh
@@ -0,0 +1,365 @@
+#!/bin/bash
+
+# _4g is as _4c, but reducing the --jesus-hidden-dim further from 7500 to 4000.
+# Strangely, the trend from 4a->4c does not continue: instead of continuing to get worse,
+# the train and valid probs both get better.
+
+#                      4a     4c      4g
+#  Final train prob: -0.0879  -0.08820  -0.08784
+#  Final valid prob: -0.1214  -0.1241   -0.1204
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+# Yay-- WER is slightly better or the same.  Final train-prob is worse
+# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241.
+
+# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4a
+# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys
+# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12          # stages guarded by a smaller number in "[ $stage -le N ]" are skipped
+train_stage=-10   # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10 # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_4g # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=true  # tested before the lang_sw1_fsh_fg rescoring; explicit default instead of an unset variable
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh   # explicit ./ (as for path.sh) so a cmd.sh found via $PATH is never sourced instead
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Every derived name below gains an "_sp" tag when speed perturbation is on.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# --generate-alignments mirrors --speed-perturb: the speed-perturbed data
+# needs alignments of its own.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Dump the alignments as lattices (gives the CTC training more freedom),
+  # using as many jobs as the existing alignment directory was made with.
+  nj=$(< $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --cmd "$train_cmd" --nj $nj \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite only its 'topo' with a
+  # one-state-per-phone topology. [It really has two states: the first one is
+  # only repeated once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  nonsilphonelist=$(< $lang/phones/nonsilence.csl) || exit 1;
+  silphonelist=$(< $lang/phones/silence.csl) || exit 1;
+  # gen_topo.py is given the non-silence list first, then the silence list.
+  # This special topology may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Grow the tree on the new topology in $lang (9000 is presumably the leaf target).
+  steps/nnet3/chain/build_tree.sh --cmd "$train_cmd" \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # only the CLSP branch above may create it; a failing touch would abort under set -e
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 4000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # The lang directory used here is mismatched as far as its 'topo' goes, but
+  # that does not matter: this script doesn't read the 'topo' from it.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  # Decode both test sets concurrently, one background subshell per set.
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescore unless has_fisher is explicitly set to false
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # barrier for the background decodes; their exit codes do not affect this script's status
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh
new file mode 100644
index 00000000000..9125d4e7967
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh
@@ -0,0 +1,386 @@
+#!/bin/bash
+
+# _4n is as _4f, but adding the [new] option --convert-repeated-to-block-iter=100.
+#  reusing iter 100 of model 4f to avoid some iterations of training [did this by
+# doing (cd exp/chain; cp -r tdnn_4f_sp tdnn_4n_sp), and then running this script with
+# --iter 100].
+# [note: to get the block-affine stuff to train fast enough to make a difference
+#  I multiplied a factor of sqrt(num-blocks) into the learning-rate factor in
+#  the code.  That change is not committed.]
+#
+# Essentially no effect on WER, but train and valid probs are worse.
+# ./compare_wer.sh 4f 4n
+# System                       4f        4n
+# WER on train_dev(tg)      16.83     16.84
+# WER on train_dev(fg)      15.73     15.69
+# WER on eval2000(tg)        18.4      18.4
+# WER on eval2000(fg)        16.6      16.6
+# Final train prob      -0.105832 -0.111309
+# Final valid prob      -0.123021 -0.123601
+
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit on the first unhandled command failure
+
+# configs for 'chain'
+stage=12  # with this default the 'stage -le 9/10/11' blocks below are skipped
+train_stage=-10  # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10  # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true  # when "true", the _sp suffix is appended to the names derived below
+dir=exp/chain/tdnn_4n # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # passed to build_tree.sh in the tree-building stage
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not referenced anywhere else in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false  # forwarded to train_tdnn.sh as --remove-egs
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so that a cmd.sh found earlier on PATH is never sourced instead
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # refuse to run on a Kaldi build without CUDA support
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Names built from $suffix below pick up _sp when speed perturbation is on.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *) suffix= ;;
+esac
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# Speed-perturbed data needs its own alignments, so the common script is asked
+# to generate them exactly when speed perturbation is enabled.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Dump the training alignments as lattices (this gives the training more
+  # freedom), with the same number of jobs as the existing alignment directory.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite its topo file with one
+  # that has one state per phone (really two: the first is visited exactly
+  # once, the second has zero or more repeats).
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_csl=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_csl=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # gen_topo.py is given the non-silence list first, then the silence list.
+  # This special topology may need tuning later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_csl $sil_csl >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the tree in $treedir from $ali_dir, using the topology in $lang.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # $dir/egs may not exist yet (egs are reused via --egs-dir); create it so 'touch' cannot fail under 'set -e'.
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --convert-repeated-to-block-iter 100 \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # The 'topo' in data/lang_sw1_tg is mismatched with the one trained with, but
+  # this script does not read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs started below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; unset already behaved as true
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_sw1_{tg,fsh_fg} \
+          data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+    decode_pids="$decode_pids $!"
+  done
+fi
+# A bare 'wait' always returns 0; wait on each PID so that a failed decode is reported.
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh
new file mode 100755
index 00000000000..d2b073cdc77
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh
@@ -0,0 +1,381 @@
+#!/bin/bash
+
+# _4p is as _4f, but one fewer layer, and making the final-layer context wider to
+# compensate; also increasing the jesus-layer input and output dims 400->500 and 1500->1600 to
+# somewhat compensate for the reduction in parameters.
+
+# definitely worse.  Later with 4r I go in the opposite direction by adding a new layer,
+# and get a small improvement.
+# ./compare_wer.sh 4f 4p
+# System                       4f        4p
+# WER on train_dev(tg)      16.83     17.36
+# WER on train_dev(fg)      15.73     16.10
+# WER on eval2000(tg)        18.4      19.1
+# WER on eval2000(fg)        16.6      17.2
+# Final train prob      -0.105832 -0.104439
+# Final valid prob      -0.123021 -0.125576
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit on (most) command failures
+
+# configs for 'chain'
+stage=12
+train_stage=-10     # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10   # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_4p # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # forwarded to build_tree.sh
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not passed to train_tdnn.sh in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+# cmd.sh is sourced via ./ like path.sh: a bare name would be looked up in $PATH first.
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage "$stage" \
+  --speed-perturb "$speed_perturb" \
+  --generate-alignments "$speed_perturb" || exit 1;
+
+
+if [ "$stage" -le 9 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat "$ali_dir/num_jobs") || exit 1;
+  steps/align_fmllr_lats.sh --nj "$nj" --cmd "$train_cmd" "data/$train_set" \
+    data/lang exp/tri4 "exp/tri4_lats_nodup$suffix"
+  rm "exp/tri4_lats_nodup$suffix"/fsts.*.gz # save space (glob is kept outside the quotes)
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf "$lang"
+  cp -r data/lang "$lang"
+  silphonelist=$(cat "$lang/phones/silence.csl") || exit 1;
+  nonsilphonelist=$(cat "$lang/phones/nonsilence.csl") || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py "$nonsilphonelist" "$silphonelist" >"$lang/topo"
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate "$leftmost_questions_truncate" \
+      --cmd "$train_cmd" 9000 "data/$train_set" "$lang" "$ali_dir" "$treedir"
+fi
+
+if [ "$stage" -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "$dir/egs/storage" ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p "$dir/egs"  # not guaranteed to exist off the CLSP grid; touch would abort under 'set -e'
+  touch "$dir/egs/.nodelete" # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage "$train_stage" \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 450  --jesus-forward-output-dim 1600 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage "$get_egs_stage" \
+    --minibatch-size "$minibatch_size" \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg "$frames_per_eg" \
+    --num-epochs "$num_epochs" --num-jobs-initial "$num_jobs_initial" --num-jobs-final "$num_jobs_final" \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate "$initial_effective_lrate" --final-effective-lrate "$final_effective_lrate" \
+    --max-param-change "$max_param_change" \
+    --cmd "$decode_cmd" \
+    --remove-egs "$remove_egs" \
+    data/${train_set}_hires "$treedir" "exp/tri4_lats_nodup$suffix" "$dir"  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg "$dir" "$dir/graph_sw1_tg"
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+pids=()  # PIDs of the background decoding jobs, so that their failures are not lost.
+if [ "$stage" -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        "$graph_dir" data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      # has_fisher is not assigned in the visible config; unset still means "rescore", now explicitly.
+      if ${has_fisher:-true}; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_sw1_{tg,fsh_fg} \
+          data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+    pids+=($!)
+  done
+fi
+rc=0; for pid in "${pids[@]}"; do wait "$pid" || rc=1; done  # a bare 'wait' always returns 0
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh
new file mode 100755
index 00000000000..9f2534f4f22
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+
+# this is based on Dan's tdnn_2o script
+# it has a different splicing configuration
+# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer
+
+set -e  # exit on (most) command failures
+
+#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys
+#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+
+
+# configs for 'chain'
+affix=              # if non-empty, "_$affix" is inserted into $dir before the _sp suffix
+stage=10
+train_stage=-10     # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10   # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_4q  # Note: _sp will get added to this if $speed_perturb == true.
+decode_iter=        # if non-empty, decoding is run with "--iter $decode_iter"
+
+# TDNN options
+splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+# smoothing options
+pool_window=7
+pool_type='per-dim-weighted-average'
+pool_lpfilter_width=
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # forwarded to build_tree.sh
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=64
+relu_dim=700
+frames_per_eg=150
+remove_egs=false
+common_egs_dir=     # forwarded to train_tdnn.sh as --egs-dir (empty by default)
+
+
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+# cmd.sh is sourced via ./ like path.sh: a bare name would be looked up in $PATH first.
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}${affix:+_$affix}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2o_tree$suffix
+lang=data/lang_chain_2o
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage "$stage" \
+  --speed-perturb "$speed_perturb" \
+  --generate-alignments "$speed_perturb" || exit 1;
+
+
+if [ "$stage" -le 9 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat "$ali_dir/num_jobs") || exit 1;
+  steps/align_fmllr_lats.sh --nj "$nj" --cmd "$train_cmd" "data/$train_set" \
+    data/lang exp/tri4 "exp/tri4_lats_nodup$suffix"
+  rm "exp/tri4_lats_nodup$suffix"/fsts.*.gz # save space (glob is kept outside the quotes)
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf "$lang"
+  cp -r data/lang "$lang"
+  silphonelist=$(cat "$lang/phones/silence.csl") || exit 1;
+  nonsilphonelist=$(cat "$lang/phones/nonsilence.csl") || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py "$nonsilphonelist" "$silphonelist" >"$lang/topo"
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate "$leftmost_questions_truncate" \
+      --cmd "$train_cmd" 9000 "data/$train_set" "$lang" "$ali_dir" "$treedir"
+fi
+
+if [ "$stage" -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "$dir/egs/storage" ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p "$dir/egs"  # not guaranteed to exist off the CLSP grid; touch would abort under 'set -e'
+  touch "$dir/egs/.nodelete" # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage "$train_stage" \
+    --apply-deriv-weights false \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage "$get_egs_stage" \
+    --minibatch-size "$minibatch_size" \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg "$frames_per_eg" \
+    --num-epochs "$num_epochs" --num-jobs-initial "$num_jobs_initial" --num-jobs-final "$num_jobs_final" \
+    --splice-indexes "$splice_indexes" \
+    --pool-type "$pool_type" \
+    --pool-window "$pool_window" \
+    --pool-lpfilter-width "$pool_lpfilter_width" \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate "$initial_effective_lrate" --final-effective-lrate "$final_effective_lrate" \
+    --max-param-change "$max_param_change" \
+    --final-layer-normalize-target "$final_layer_normalize_target" \
+    --relu-dim "$relu_dim" \
+    --cmd "$decode_cmd" \
+    --remove-egs "$remove_egs" \
+    --egs-dir "$common_egs_dir" \
+    data/${train_set}_hires "$treedir" "exp/tri4_lats_nodup$suffix" "$dir"  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg "$dir" "$dir/graph_sw1_tg"
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+pids=()  # PIDs of the background decoding jobs, so that their failures are not lost.
+if [ "$stage" -le 14 ]; then
+  iter_opts=  # expanded unquoted below on purpose, so it splits into separate arguments
+  if [ -n "$decode_iter" ]; then
+    iter_opts=" --iter $decode_iter "
+  fi
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj 50 --cmd "$decode_cmd" $iter_opts --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        "$graph_dir" data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not assigned in this script; unset means "rescore"
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_sw1_{tg,fsh_fg} \
+          data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+    pids+=($!)
+  done
+fi
+rc=0; for pid in "${pids[@]}"; do wait "$pid" || rc=1; done  # a bare 'wait' always returns 0
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh
new file mode 100755
index 00000000000..64831b5802a
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh
@@ -0,0 +1,380 @@
+#!/bin/bash
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit on (most) command failures
+
+# configs for 'chain'
+stage=12
+train_stage=-10     # forwarded to train_tdnn.sh as --stage
+get_egs_stage=-10   # forwarded to train_tdnn.sh as --get-egs-stage
+speed_perturb=true
+dir=exp/chain/tdnn_4r # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # forwarded to build_tree.sh
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not passed to train_tdnn.sh in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+# cmd.sh is sourced via ./ like path.sh: a bare name would be looked up in $PATH first.
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage "$stage" \
+  --speed-perturb "$speed_perturb" \
+  --generate-alignments "$speed_perturb" || exit 1;
+
+
+if [ "$stage" -le 9 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat "$ali_dir/num_jobs") || exit 1;
+  steps/align_fmllr_lats.sh --nj "$nj" --cmd "$train_cmd" "data/$train_set" \
+    data/lang exp/tri4 "exp/tri4_lats_nodup$suffix"
+  rm "exp/tri4_lats_nodup$suffix"/fsts.*.gz # save space (glob is kept outside the quotes)
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf "$lang"
+  cp -r data/lang "$lang"
+  silphonelist=$(cat "$lang/phones/silence.csl") || exit 1;
+  nonsilphonelist=$(cat "$lang/phones/nonsilence.csl") || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py "$nonsilphonelist" "$silphonelist" >"$lang/topo"
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate "$leftmost_questions_truncate" \
+      --cmd "$train_cmd" 9000 "data/$train_set" "$lang" "$ali_dir" "$treedir"
+fi
+
+if [ "$stage" -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "$dir/egs/storage" ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p "$dir/egs"  # not guaranteed to exist off the CLSP grid; touch would abort under 'set -e'
+  touch "$dir/egs/.nodelete" # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage "$train_stage" \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage "$get_egs_stage" \
+    --minibatch-size "$minibatch_size" \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg "$frames_per_eg" \
+    --num-epochs "$num_epochs" --num-jobs-initial "$num_jobs_initial" --num-jobs-final "$num_jobs_final" \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate "$initial_effective_lrate" --final-effective-lrate "$final_effective_lrate" \
+    --max-param-change "$max_param_change" \
+    --cmd "$decode_cmd" \
+    --remove-egs "$remove_egs" \
+    data/${train_set}_hires "$treedir" "exp/tri4_lats_nodup$suffix" "$dir"  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg "$dir" "$dir/graph_sw1_tg"
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+pids=()  # PIDs of the background decoding jobs, so that their failures are not lost.
+if [ "$stage" -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+    (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+        --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        "$graph_dir" data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      # has_fisher is not assigned in this script; unset still means "rescore", now explicitly.
+      if ${has_fisher:-true}; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_sw1_{tg,fsh_fg} \
+          data/${decode_set}_hires $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+    ) &
+    pids+=($!)
+  done
+fi
+rc=0; for pid in "${pids[@]}"; do wait "$pid" || rc=1; done  # a bare 'wait' always returns 0
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh
new file mode 100755
index 00000000000..92a1a7da277
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh
@@ -0,0 +1,380 @@
+#!/bin/bash
+
+# _4s is as _4f, but with --leaky-hmm-coefficient 0.02.  [A new option-
+#currently in a branch]
+# Overall no real change.
+
+# ./compare_wer.sh 4f 4s
+# System                       4f        4s
+# WER on train_dev(tg)      16.83     16.82
+# WER on train_dev(fg)      15.73     15.62
+# WER on eval2000(tg)        18.4      18.5
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.111371
+# Final valid prob      -0.123021  -0.12648
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # exit as soon as a command fails
+
+# configs for 'chain'
+stage=12  # each stage block below runs only when $stage <= its number
+train_stage=-10  # passed to train_tdnn.sh as --stage
+get_egs_stage=-10  # passed to train_tdnn.sh as --get-egs-stage
+speed_perturb=true  # "true" makes the script append _sp to $dir and to the data/ali/tree/lats names
+dir=exp/chain/tdnn_4s # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1  # passed to build_tree.sh as --leftmost-questions-truncate
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE(review): not referenced anywhere else in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh  # presumably defines $train_cmd and $decode_cmd used below -- TODO confirm
+. ./path.sh
+. ./utils/parse_options.sh  # presumably lets command-line options override the variables above -- TODO confirm
+
+if ! cuda-compiled; then  # print the message below and exit 1 when cuda-compiled reports failure
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # stays empty unless speed perturbation is enabled
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # directory that training, graph and decode outputs are written under
+train_set=train_nodup$suffix  # data/$train_set and data/${train_set}_hires are read below
+ali_dir=exp/tri4_ali_nodup$suffix  # alignment dir handed to build_tree.sh
+treedir=exp/chain/tri5_2y_tree$suffix  # handed to build_tree.sh and then to train_tdnn.sh
+lang=data/lang_chain_2y  # re-created from data/lang in the stage <= 10 block
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then  # not run with the default stage=12
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;  # job count stored in the alignment dir
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then  # not run with the default stage=12
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # drop any earlier copy before re-copying data/lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo  # non-silence list first, then silence list
+fi
+
+if [ $stage -le 11 ]; then  # not run with the default stage=12
+  # Build a tree using our new topology ($treedir, the last argument, is presumably the output dir).
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # network training; the first stage block run with the default stage=12
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ # $dir/egs need not exist yet (egs are reused via --egs-dir); create it so touch cannot fail under set -e.
+ mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --leaky-hmm-coefficient 0.02 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then  # build $dir/graph_sw1_tg, which the decode stage reads as $graph_dir
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decode jobs, reaped at the end of the script
+if [ $stage -le 14 ]; then  # decode (and optionally rescore) each test set in its own background subshell
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in this script; unset already behaved as true
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"  # the "|| exit 1" above only leaves the subshell, so keep its PID
+  done
+fi
+decode_rc=0; for pid in $decode_pids; do wait $pid || decode_rc=1; done  # a bare wait would discard job failures
+exit $decode_rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh
new file mode 100755
index 00000000000..30b383d05d7
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh
@@ -0,0 +1,382 @@
+#!/bin/bash
+
+# _4t is as _4s, but with --leaky-hmm-coefficient 0.04.
+
+# [note, I accidentally overwrote this directory afterwards, and moved it.]
+# It's really not clear whether it's helpful.
+# ./compare_wer.sh 4f 4t
+# System                       4f        4t
+# WER on train_dev(tg)      16.83     16.75
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.5
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.112721
+# Final valid prob      -0.123021 -0.129688
+
+# _4s is as _4f, but with --leaky-hmm-coefficient 0.02.  [A new option.]
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5  # not referenced anywhere below.
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=true  # read by the decoding stage: rescore with data/lang_sw1_fsh_fg unless set to false.
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit './' so that a cmd.sh found via $PATH cannot be sourced by mistake.
+. ./path.sh
+. ./utils/parse_options.sh
+
+cuda-compiled || {  # not compiled with CUDA: print the hint below and stop.
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+}
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Speed-perturbed experiments carry an "_sp" suffix on every derived name.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+# Run the common feature/iVector preparation; alignments are requested only
+# when the speed-perturbed data is in use.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1
+
+
+if [ "$stage" -le 9 ]; then
+  # Dump the alignments as lattices (gives the CTC training more freedom),
+  # reusing the job count recorded in the alignment directory.
+  nj=$(cat ${ali_dir}/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/${train_set} data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz # save space
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite its 'topo' with the chain
+  # topology: nominally one state per phone [really two: the first is only
+  # repeated once, the second has zero or more repeats].
+  rm -rf ${lang}
+  cp -r data/lang ${lang}
+  sil_csl=$(cat ${lang}/phones/silence.csl) || exit 1
+  nonsil_csl=$(cat ${lang}/phones/nonsilence.csl) || exit 1
+  # Special topology written by gen_topo.py; it may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py ${nonsil_csl} ${sil_csl} \
+    >${lang}/topo
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Tree building, on top of the lang directory that has the new topology.
+  steps/nnet3/chain/build_tree.sh --cmd "$train_cmd" --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate ${leftmost_questions_truncate} \
+      9000 data/${train_set} ${lang} ${ali_dir} ${treedir}
+fi
+
+if [ $stage -le 12 ]; then  # train the chain TDNN, re-using the egs in exp/chain/tdnn_2y_sp/egs.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # the split-dir step above is CLSP-only, so the directory may not exist yet.
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --leaky-hmm-coefficient 0.08 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # data/lang_sw1_tg looks mismatched as far as its 'topo' is concerned, but that
+  # is fine: this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg ${dir} ${dir}/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, so that their failures can be detected.
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescoring runs unless has_fisher is explicitly false.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # wait for every job; remember any failure.
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh
new file mode 100755
index 00000000000..ae7cf02b426
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh
@@ -0,0 +1,384 @@
+#!/bin/bash
+
+# _4u is as _4t, but with --leaky-hmm-coefficient 0.08.  Note: the
+# ultimate baseline is 4f.
+
+# It seems a bit better on average.
+#./compare_wer.sh 4f 4u
+#System                       4f        4u
+#WER on train_dev(tg)      16.83     16.47
+#WER on train_dev(fg)      15.73     15.23
+#WER on eval2000(tg)        18.4      18.4
+#WER on eval2000(fg)        16.6      16.7
+#Final train prob      -0.105832 -0.118911
+#Final valid prob      -0.123021 -0.135768
+
+# _4t is as _4s, but with --leaky-hmm-coefficient 0.04.
+
+# _4s is as _4f, but with --leaky-hmm-coefficient 0.02.  [A new option.]
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5  # not referenced anywhere below.
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=true  # read by the decoding stage: rescore with data/lang_sw1_fsh_fg unless set to false.
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh  # explicit './' so that a cmd.sh found via $PATH cannot be sourced by mistake.
+. ./path.sh
+. ./utils/parse_options.sh
+
+cuda-compiled || {  # not compiled with CUDA: print the hint below and stop.
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+}
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+# Speed-perturbed experiments carry an "_sp" suffix on every derived name.
+case "$speed_perturb" in
+  true) suffix=_sp ;;
+  *)    suffix= ;;
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+# Run the common feature/iVector preparation; alignments are requested only
+# when the speed-perturbed data is in use.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1
+
+
+if [ "$stage" -le 9 ]; then
+  # Dump the alignments as lattices (gives the CTC training more freedom),
+  # reusing the job count recorded in the alignment directory.
+  nj=$(cat ${ali_dir}/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/${train_set} data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz # save space
+fi
+
+
+if [ "$stage" -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite its 'topo' with the chain
+  # topology: nominally one state per phone [really two: the first is only
+  # repeated once, the second has zero or more repeats].
+  rm -rf ${lang}
+  cp -r data/lang ${lang}
+  sil_csl=$(cat ${lang}/phones/silence.csl) || exit 1
+  nonsil_csl=$(cat ${lang}/phones/nonsilence.csl) || exit 1
+  # Special topology written by gen_topo.py; it may have to be tuned later on.
+  steps/nnet3/chain/gen_topo.py ${nonsil_csl} ${sil_csl} \
+    >${lang}/topo
+fi
+
+if [ "$stage" -le 11 ]; then
+  # Tree building, on top of the lang directory that has the new topology.
+  steps/nnet3/chain/build_tree.sh --cmd "$train_cmd" --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate ${leftmost_questions_truncate} \
+      9000 data/${train_set} ${lang} ${ali_dir} ${treedir}
+fi
+
+if [ $stage -le 12 ]; then  # train the chain TDNN, re-using the egs in exp/chain/tdnn_2y_sp/egs.
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  mkdir -p $dir/egs  # the split-dir step above is CLSP-only, so the directory may not exist yet.
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --leaky-hmm-coefficient 0.08 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ "$stage" -le 13 ]; then
+  # data/lang_sw1_tg looks mismatched as far as its 'topo' is concerned, but that
+  # is fine: this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg ${dir} ${dir}/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, so that their failures can be detected.
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescoring runs unless has_fisher is explicitly false.
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # wait for every job; remember any failure.
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh
new file mode 100755
index 00000000000..9cdbfefb5a2
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh
@@ -0,0 +1,394 @@
+#!/bin/bash
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+#./compare_wer.sh 4r 4v
+#System                       4r        4v
+#WER on train_dev(tg)      16.50     15.95
+#WER on train_dev(fg)      15.45     14.69
+#WER on eval2000(tg)        18.3      17.7
+#WER on eval2000(fg)        16.7      16.0
+#Final train prob      -0.103652 -0.106646  -1.60775
+#Final valid prob      -0.121105 -0.118631  -1.62832
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4v # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5  # NOTE: not referenced anywhere else in this script.
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $*"  # Print the command line for logging
+
+. ./cmd.sh  # explicit ./ so a cmd.sh found earlier on $PATH cannot be sourced instead
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # the notice below goes to stderr, then the script exits 1
+  cat >&2 <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+case "$speed_perturb" in
+  true) suffix=_sp ;;  # speed-perturbed runs get their own directory names
+esac
+
+dir=${dir}${suffix}
+train_set=train_nodup${suffix}
+ali_dir=exp/tri4_ali_nodup${suffix}
+treedir=exp/chain/tri5_2y_tree${suffix}
+lang=data/lang_chain_2y
+
+
+# If we are using the speed-perturbed data we need alignments for it, so
+# --generate-alignments follows $speed_perturb.  (No stage guard here; --stage is forwarded.)
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Dump the alignments as lattices (gives the CTC training more freedom),
+  # reusing the job count recorded in the alignment directory.
+  nj=$(cat $ali_dir/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup${suffix}
+  rm exp/tri4_lats_nodup${suffix}/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Start from a fresh copy of data/lang and overwrite its topo file so that it
+  # has one state per phone.  [note, it really has two states.. the first one is
+  # only repeated once, the second one has zero or more repeats.]
+  rm -rf "$lang"
+  cp -r data/lang "$lang"
+  sil_csl=$(cat "$lang/phones/silence.csl") || exit 1
+  nonsil_csl=$(cat "$lang/phones/nonsilence.csl") || exit 1
+  # Use our special topology (non-silence list is passed first, then silence)...
+  # note that later on may have to tune this topology.
+  steps/nnet3/chain/gen_topo.py $nonsil_csl $sil_csl > "$lang/topo"
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology (the topo file written into $lang).
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies ($dir/egs may not exist yet).
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: the lang directory used here (data/lang_sw1_tg) might appear mismatched,
+  # and it is as far as the 'topo' is concerned, but this script (presumably
+  # meaning mkgraph.sh) doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=(); decode_status=0  # background decode jobs are reaped one by one so failures are not lost
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is never set in this script; unset/empty means "rescore"
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids+=($!)
+  done
+fi
+for pid in "${decode_pids[@]}"; do wait $pid || decode_status=1; done
+exit $decode_status
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh
new file mode 100755
index 00000000000..6dd5c587f7a
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh
@@ -0,0 +1,397 @@
+#!/bin/bash
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2.  WER seems consistently a
+# bit worse, although final valid prob is very slightly better.
+
+#./compare_wer.sh 4v 4w
+#System                       4v        4w
+#WER on train_dev(tg)      15.95     16.05
+#WER on train_dev(fg)      14.69     14.92
+#WER on eval2000(tg)        17.7      18.0
+#WER on eval2000(fg)        16.0      16.2
+#Final train prob      -0.106646 -0.108816
+#Final valid prob      -0.118631 -0.118254
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # abort as soon as any command fails
+
+# configs for 'chain' (presumably overridable as --name value via utils/parse_options.sh, sourced below)
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4w # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5  # NOTE: not referenced anywhere below in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=${has_fisher:-true}  # tested by the decoding stage before LM rescoring; was never defined (keeps any value from the environment)
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # refuse to run when Kaldi was not built with CUDA
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # becomes _sp when speed perturbation is enabled; appended to the names below
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # experiment directory, e.g. exp/chain/tdnn_4w_sp
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix  # tree directory written by stage 11
+lang=data/lang_chain_2y  # lang directory with the chain topology, written by stage 10
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.  The current $stage is forwarded to the helper script.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Produce the alignments in lattice form (this gives the training more
+  # freedom), running with the same number of jobs as the alignments used.
+  lat_dir=exp/tri4_lats_nodup$suffix
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set data/lang exp/tri4 $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # start from a clean copy; anything already in $lang is discarded
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.  Only $lang/topo is overwritten; the rest stays as copied from data/lang.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the decision tree for the topology written to $lang/topo.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate --cmd "$train_cmd" \
+      9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # neural-network training
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # Off the CLSP grid nothing has created $dir/egs yet, so make it before the touch (which would abort under set -e).
+  mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the network; the egs are taken from the directory given to --egs-dir.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.2 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # The 'topo' in this lang directory is mismatched with the one used in
+  # training, but this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the per-test-set background jobs started below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescore with the sw1_fsh_fg LM unless has_fisher=false
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+decode_rc=0; for pid in $decode_pids; do wait $pid || decode_rc=1; done  # a bare 'wait' would discard the jobs' exit codes
+[ $decode_rc -eq 0 ] || echo "$0: at least one decoding/rescoring job failed" >&2
+exit $decode_rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh
new file mode 100755
index 00000000000..0290e0bdbd5
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh
@@ -0,0 +1,396 @@
+#!/bin/bash
+
+# _4x is as _4u, but with --leaky-hmm-coefficient 0.2.   Note: the
+# ultimate baseline is 4f.  It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1).
+# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1.
+#
+# ./compare_wer.sh  4f 4u 4x
+# System                       4f        4u        4x
+# WER on train_dev(tg)      16.83     16.47     16.63
+# WER on train_dev(fg)      15.73     15.23     15.42
+# WER on eval2000(tg)        18.4      18.4      18.4
+# WER on eval2000(fg)        16.6      16.7      16.6
+# Final train prob      -0.105832 -0.118911 -0.130674
+# Final valid prob      -0.123021 -0.135768 -0.146351
+
+# _4u is as _4t, but with --leaky-hmm-coefficient 0.08.  Note: the
+# ultimate baseline is 4f.
+
+#./compare_wer.sh 4f 4u
+#System                       4f        4u
+#WER on train_dev(tg)      16.83     16.47
+#WER on train_dev(fg)      15.73     15.23
+#WER on eval2000(tg)        18.4      18.4
+#WER on eval2000(fg)        16.6      16.7
+#Final train prob      -0.105832 -0.118911
+#Final valid prob      -0.123021 -0.135768
+
+# _4t is as _4s, but with --leaky-hmm-coefficient 0.04.
+
+# _4s is as _4f, but with --leaky-hmm-coefficient 0.02.  [A new option.]
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e  # abort as soon as any command fails
+
+# configs for 'chain' (presumably overridable as --name value via utils/parse_options.sh, sourced below)
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4x # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5  # NOTE: not referenced anywhere below in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+has_fisher=${has_fisher:-true}  # tested by the decoding stage before LM rescoring; was never defined (keeps any value from the environment)
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # refuse to run when Kaldi was not built with CUDA
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=  # becomes _sp when speed perturbation is enabled; appended to the names below
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix  # experiment directory, e.g. exp/chain/tdnn_4x_sp
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix  # tree directory written by stage 11
+lang=data/lang_chain_2y  # lang directory with the chain topology, written by stage 10
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.  The current $stage is forwarded to the helper script.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Produce the alignments in lattice form (this gives the training more
+  # freedom), running with the same number of jobs as the alignments used.
+  lat_dir=exp/tri4_lats_nodup$suffix
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set data/lang exp/tri4 $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # start from a clean copy; anything already in $lang is discarded
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.  Only $lang/topo is overwritten; the rest stays as copied from data/lang.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build the decision tree for the topology written to $lang/topo.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate --cmd "$train_cmd" \
+      9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # neural-network training
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # Off the CLSP grid nothing has created $dir/egs yet, so make it before the touch (which would abort under set -e).
+  mkdir -p $dir/egs; touch $dir/egs/.nodelete # keep egs around when that run dies.
+  # Train the network; the egs are taken from the directory given to --egs-dir.
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --leaky-hmm-coefficient 0.2 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # The 'topo' in this lang directory is mismatched with the one used in
+  # training, but this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 \
+    data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the per-test-set background jobs started below
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # rescore with the sw1_fsh_fg LM unless has_fisher=false
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"
+  done
+fi
+decode_rc=0; for pid in $decode_pids; do wait $pid || decode_rc=1; done  # a bare 'wait' would discard the jobs' exit codes
+[ $decode_rc -eq 0 ] || echo "$0: at least one decoding/rescoring job failed" >&2
+exit $decode_rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh
new file mode 100755
index 00000000000..cd1de07a80d
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh
@@ -0,0 +1,401 @@
+#!/bin/bash
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500.  Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.  Result: very helpful
+# (WER improves by between 0.2% and 0.6% absolute; see the table below).
+
+#./compare_wer.sh 4w 5a
+#System                       4w        5a
+#WER on train_dev(tg)      16.05     15.86
+#WER on train_dev(fg)      14.92     14.74
+#WER on eval2000(tg)        18.0      17.4
+#WER on eval2000(fg)        16.2      15.6
+#Final train prob      -0.108816 -0.0998359
+#Final valid prob      -0.118254 -0.115884
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom),
+  # using the same num-jobs as the existing alignments in $ali_dir.
+  nj=$(cat $ali_dir/num_jobs) || exit 1
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz  # removed only to save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a fresh copy of data/lang in $lang whose topo file has one state per
+  # phone. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist > $lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using the new topology that stage 10 wrote to $lang/topo.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # train the chain model with train_tdnn.sh
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs # nothing above creates it off the CLSP grid (egs come from --egs-dir), and touch would abort under set -e
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.2 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 500  --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as far as the
+  # 'topo' is concerned, but this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decoding jobs, collected so their exit status can be checked
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not assigned in this script; unset/empty means "rescore", as before
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) & decode_pids="$decode_pids $!"  # the "|| exit 1"s above only end this subshell
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # wait for every decode job and remember any failure
+exit $rc
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh
new file mode 100755
index 00000000000..7e44c10920e
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh
@@ -0,0 +1,404 @@
+#!/bin/bash
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b
+#System                       5a        5b
+#WER on train_dev(tg)      15.86     15.51
+#WER on train_dev(fg)      14.74     14.39
+#WER on eval2000(tg)        17.4      17.3
+#WER on eval2000(fg)        15.6      15.6
+#Final train prob     -0.0998359 -0.112013
+#Final valid prob      -0.115884 -0.130879
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500.  Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain' (defaults; options such as --stage override them, presumably via parse_options.sh below)
+stage=12  # with this default the "stage -le 9/10/11" blocks further down are skipped
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5  # NOTE(review): never passed to train_tdnn.sh in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+# Source cmd.sh via ./ like the files below, so bash does not look it up in $PATH.
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# iVector extraction and feature dumping are shared with the standard nnet3
+# setup; once they have been run they can be skipped by passing "--stage 8".
+
+# Everything derived from speed-perturbed data carries an _sp suffix.
+suffix=
+if [[ $speed_perturb == true ]]; then
+  suffix=_sp
+fi
+
+dir="${dir}${suffix}"
+train_set="train_nodup${suffix}"
+ali_dir="exp/tri4_ali_nodup${suffix}"
+treedir="exp/chain/tri5_2y_tree${suffix}"
+lang=data/lang_chain_2y
+
+
+# When training on speed-perturbed data, alignments have to be generated for
+# it, hence --generate-alignments follows $speed_perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the training alignments in lattice form (gives the training more
+  # freedom), using the same number of jobs as the existing alignments.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a copy of data/lang whose topo file has one state per phone.  [Strictly
+  # speaking it has two states: the first one occurs only once, the second one
+  # has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Write our special topology into the copy; this topology may have to be
+  # tuned later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones > $lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree on top of the new topology.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # $dir/egs may not exist yet (only the CLSP branch above creates it); make it so touch cannot fail.
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.2 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 500  --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in the config section above; default to rescoring
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # NOTE: a bare 'wait' returns 0, so a failed background decode does not fail the script
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh
new file mode 100755
index 00000000000..93ebb59b16d
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh
@@ -0,0 +1,409 @@
+#!/bin/bash
+
+# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be
+# worse than 0.1.
+# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2).
+#System                       4w        5c
+#WER on train_dev(tg)      16.05     16.35
+#WER on train_dev(fg)      14.92     15.21
+#WER on eval2000(tg)        18.0      17.8
+#WER on eval2000(fg)        16.2      16.4
+#Final train prob      -0.108816 -0.107098
+#Final valid prob      -0.118254 -0.118209
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2.  WER seems consistently
+# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very
+# slightly better.
+
+#./compare_wer.sh 4v 4w
+#System                       4v        4w
+#WER on train_dev(tg)      15.95     16.05
+#WER on train_dev(fg)      14.69     14.92
+#WER on eval2000(tg)        17.7      18.0
+#WER on eval2000(fg)        16.0      16.2
+#Final train prob      -0.106646 -0.108816
+#Final valid prob      -0.118631 -0.118254
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain' (defaults; options such as --stage override them, presumably via parse_options.sh below)
+stage=12  # with this default the "stage -le 9/10/11" blocks further down are skipped
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5  # NOTE(review): never passed to train_tdnn.sh in this script
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+# Source cmd.sh via ./ like the files below, so bash does not look it up in $PATH.
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# iVector extraction and feature dumping are shared with the standard nnet3
+# setup; once they have been run they can be skipped by passing "--stage 8".
+
+# Everything derived from speed-perturbed data carries an _sp suffix.
+suffix=
+if [[ $speed_perturb == true ]]; then
+  suffix=_sp
+fi
+
+dir="${dir}${suffix}"
+train_set="train_nodup${suffix}"
+ali_dir="exp/tri4_ali_nodup${suffix}"
+treedir="exp/chain/tri5_2y_tree${suffix}"
+lang=data/lang_chain_2y
+
+
+# When training on speed-perturbed data, alignments have to be generated for
+# it, hence --generate-alignments follows $speed_perturb.
+local/nnet3/run_ivector_common.sh \
+  --stage $stage --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the training alignments in lattice form (gives the training more
+  # freedom), using the same number of jobs as the existing alignments.
+  nj=$(cat $ali_dir/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
+    data/$train_set data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Make a copy of data/lang whose topo file has one state per phone.  [Strictly
+  # speaking it has two states: the first one occurs only once, the second one
+  # has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  sil_phones=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsil_phones=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Write our special topology into the copy; this topology may have to be
+  # tuned later on.
+  steps/nnet3/chain/gen_topo.py $nonsil_phones $sil_phones > $lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree on top of the new topology.
+  steps/nnet3/chain/build_tree.sh \
+      --frame-subsampling-factor 3 --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # $dir/egs may not exist yet (only the CLSP branch above creates it); make it so touch cannot fail.
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.05 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400  --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is not set in the config section above; default to rescoring
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;  # NOTE: a bare 'wait' returns 0, so a failed background decode does not fail the script
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh
new file mode 100755
index 00000000000..8e6e9358003
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh
@@ -0,0 +1,407 @@
+#!/bin/bash
+
+# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and
+# jesus-forward-output-dim from 1800 to 2000.
+
+# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1).
+#./compare_wer.sh 5b 5d
+#System                       5b        5d
+#WER on train_dev(tg)      15.51     15.29
+#WER on train_dev(fg)      14.39     14.17
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.7
+#Final train prob      -0.112013 -0.107858
+#Final valid prob      -0.130879 -0.128862
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500.  Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1..  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then  # print the hint below and exit 1 when cuda-compiled reports failure
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang  # discard any previous copy; it is rebuilt from data/lang on the next line
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on we may have to tune this
+  # topology.  The generated topo overwrites the one copied from data/lang.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology; the result is written to $treedir.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then  # train the chain TDNN, re-using the egs given by --egs-dir below
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+  # Only the CLSP-specific step above creates $dir/egs; make sure it exists, else touch fails under set -e.
+  mkdir -p $dir/egs
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.2 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Build the decoding graph.  Note: it might appear that this lang directory
+  # (data/lang_sw1_tg) is mismatched, and it is as far as the 'topo' is concerned,
+  # but this script doesn't read the 'topo' from the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+decode_pids=  # PIDs of the background decodes, so their exit statuses can be checked
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (  # one background subshell per test set: decode, then optional fg rescoring
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # an unset has_fisher already behaved as true; now explicit
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+      decode_pids="$decode_pids $!"
+  done
+fi
+rc=0; for pid in $decode_pids; do wait $pid || rc=1; done  # plain 'wait' would hide failed decodes
+exit $rc;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh
new file mode 100755
index 00000000000..ed48b0673b8
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh
@@ -0,0 +1,417 @@
+#!/bin/bash
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                       5b        5e
+#WER on train_dev(tg)      15.51     15.43
+#WER on train_dev(fg)      14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob      -0.112013 -0.110056
+#Final valid prob      -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b
+#System                       5a        5b
+#WER on train_dev(tg)      15.86     15.51
+#WER on train_dev(fg)      14.74     14.39
+#WER on eval2000(tg)        17.4      17.3
+#WER on eval2000(fg)        15.6      15.6
+#Final train prob     -0.0998359 -0.112013
+#Final valid prob      -0.115884 -0.130879
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500.  Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :-3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs # may not exist yet (the split-dir step above is skipped off the CLSP grid); 'touch' would then abort under 'set -e'.
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 500  --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher may be unset here; default to true (an empty expansion also succeeded, so behaviour is unchanged).
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh
new file mode 100755
index 00000000000..5fb1f0c445c
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh
@@ -0,0 +1,423 @@
+#!/bin/bash
+
+# _5f is as _5e, but making the 5b->5d change (increasing the
+# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000,
+# and jesus-forward-input-dim from 500 to 600.
+
+# WER change is (-0.1, -0.2, +0.2, +0.1).  So zero on average.
+# This means 5e remains the best system so far.
+
+#./compare_wer.sh 5e 5f
+#System                       5e        5f
+#WER on train_dev(tg)      15.43     15.35
+#WER on train_dev(fg)      14.32     14.15
+#WER on eval2000(tg)        17.3      17.5
+#WER on eval2000(fg)        15.5      15.6
+#Final train prob      -0.110056  -0.10574
+#Final valid prob      -0.129184 -0.128112
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b
+#System                       5a        5b
+#WER on train_dev(tg)      15.86     15.51
+#WER on train_dev(fg)      14.74     14.39
+#WER on eval2000(tg)        17.4      17.3
+#WER on eval2000(fg)        15.6      15.6
+#Final train prob     -0.0998359 -0.112013
+#Final valid prob      -0.115884 -0.130879
+
+# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and
+# jesus-forward-input-dim from 400 to 500.  Hoping that the cross-entropy regularization
+# will mean that the increased parameters are now helpful.
+
+# _4w is as _4v, but doubling --xent-regularize to 0.2
+
+# _4v is as _4r, but with --xent-regularize 0.1.  Increasing max_param_change
+# from 1.0 to 2.0 because there is a lot of parameter change in the final xent
+# layer, and this limits the rate of change of the other layers.
+
+# _4r is as _4f, but one more hidden layer, and reducing context of existing
+# layers so we can re-use the egs.  Reducing jesus-forward-output-dim slightly
+# from 1500 to 1400.
+
+# This is better than 4f by almost all metrics.
+# ./compare_wer.sh 4f 4r
+# System                       4f        4r
+# WER on train_dev(tg)      16.83     16.50
+# WER on train_dev(fg)      15.73     15.45
+# WER on eval2000(tg)        18.4      18.3
+# WER on eval2000(fg)        16.6      16.7
+# Final train prob      -0.105832 -0.103652
+# Final valid prob      -0.123021 -0.121105
+
+# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005.
+
+# It's even better than 4e, by about 0.3% abs.
+#                        4c    4e      4f
+#  Final valid prob:   -0.1241 -0.1267  -0.1230
+#  Final train prob:   -0.08820 -0.1149 -0.1058
+
+# ./show_wer.sh 4f
+# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0
+# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 4e
+# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0
+# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0
+# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys
+# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+
+# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000.
+
+# _4a is as _3s, but using narrower splice-indexes in the first layer.
+
+# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400.
+# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p.
+# This of course reduces overtraining.  Results are a bit better than 3p but still
+# not as good as 2y
+
+# ./show_wer.sh 3s
+# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 3p
+# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0
+# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0
+# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys
+# a03:s5c: ./show_wer.sh 2y
+# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+
+# _3r is as _3p but reducing the number of parameters as it seemed to be
+# overtraining (despite already being quite a small model): [600,1800 ->
+# 500,1500].  Also in the interim there was a script change to
+# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change.
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent
+# with the halving of the minibatch size.]
+
+
+# _3p is the same as 3o, but after a code and script change so we can use
+# natural gradient for the RepeatedAffineComponent.
+# [natural gradient was helpful, based on logs;
+# also made a change to use positive bias for the jesus-component affine parts.]
+
+# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2.
+
+# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support
+# recurrence, with improvements to the learning of the jesus layers.
+
+# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found
+# to be worse.
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence
+# is helpful.]
+#./show_wer.sh 3g
+#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys
+#a03:s5c: ./show_wer.sh 2y
+#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0
+#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0
+#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+#a03:s5c: ./show_wer.sh 3d
+#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0
+#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0
+#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys
+#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys
+
+# _3f is as _3e, but modifying the splicing setup to add (left) recurrence:
+# added the :3's in   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# Therefore it's
+# no longer really a tdnn, more like an RNN combined with TDNN.  BTW, I'm not re-dumping egs with extra
+# context, and this isn't really ideal - I want to see if this seems promising first.
+
+# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default)
+# to 200 in order to reduce computation in the Jesus layer.
+
+# _3d is as _2y, and re-using the egs, but using --jesus-opts and
+# configs from make_jesus_configs.py.
+#  --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
+
+# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from
+# 800k to 1.2 million.  The aim is to avoid some of the per-job overhead
+# (model-averaging, etc.), since each iteration takes only a minute or so.
+#  I added the results to the table below.  It seems the same on average-
+# which is good.  We'll probably keep this configuration.
+
+# _2o is as _2m, but going back to our original 2-state topology, which it turns
+# out that I never tested to WER.
+# hm--- it's about the same, or maybe slightly better!
+# caution: accidentally overwrote most of this dir, but kept the key stuff.
+
+# note: when I compare with the rerun of 2o (not shown), this run is actually
+# better.
+# WER on          2m        2o          2y    [ now comparing 2o->2y:]
+# train_dev,tg    17.22     17.24       16.99  0.2% better
+# train_dev,fg    15.87     15.93       15.86  0.1% better
+# eval2000,tg     18.7      18.7        18.9   0.2% worse
+# eval2000,fg     17.0      16.9        17.0   0.1% worse
+
+# train-prob,final  -0.0803   -0.0835
+# valid-prob,final  -0.0116   -0.0122
+
+# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling
+# that mechanism.
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+#  set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+#   This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+#  from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting  --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting  --lm-opts "--num-extra-states=0".
+#  this is a kind of repeat of the u->v experiment, where it seemed to make things
+#  worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+#  The results are inconsistently different but broadly the same.  On all of eval2000,
+#  the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+#  On the train_dev data, the change is  19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=2.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the standard
+# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
+# run those things.
+
+suffix=
+if [ "$speed_perturb" == "true" ]; then
+  suffix=_sp
+fi
+
+dir=${dir}$suffix
+train_set=train_nodup$suffix
+ali_dir=exp/tri4_ali_nodup$suffix
+treedir=exp/chain/tri5_2y_tree$suffix
+lang=data/lang_chain_2y
+
+
+# if we are using the speed-perturbed data we need to generate
+# alignments for it.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --speed-perturb $speed_perturb \
+  --generate-alignments $speed_perturb || exit 1;
+
+
+if [ $stage -le 9 ]; then
+  # Get the alignments as lattices (gives the CTC training more freedom).
+  # use the same num-jobs as the alignments
+  nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1;
+  steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
+    data/lang exp/tri4 exp/tri4_lats_nodup$suffix
+  rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space
+fi
+
+
+if [ $stage -le 10 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+ mkdir -p $dir/egs # may not exist yet (the split-dir step above is skipped off the CLSP grid); 'touch' would then abort under 'set -e'.
+ touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 600  --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir  || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+         --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+         $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if ${has_fisher:-true}; then  # has_fisher is never assigned in this script; default to true (an empty expansion also succeeded, so behaviour is unchanged).
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/confidence_calibration.sh b/egs/swbd/s5c/local/confidence_calibration.sh
index de330866622..1e2337ab298 100755
--- a/egs/swbd/s5c/local/confidence_calibration.sh
+++ b/egs/swbd/s5c/local/confidence_calibration.sh
@@ -49,9 +49,11 @@ paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unig
 
 
 ###### Train the calibration,
+false && \
 steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \
   $dev_data $graph $word_feats $dev_latdir $dev_caldir
 
+
 ###### Apply the calibration to eval set,
 steps/conf/apply_calibration.sh --cmd "$decode_cmd" \
   $eval_data $graph $eval_latdir $dev_caldir $eval_caldir
diff --git a/egs/swbd/s5c/local/nnet3/run_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_discriminative.sh
new file mode 100755
index 00000000000..3237102a63d
--- /dev/null
+++ b/egs/swbd/s5c/local/nnet3/run_discriminative.sh
@@ -0,0 +1,229 @@
+#!/bin/bash
+
+set -e 
+set -o pipefail
+
+# this is run_discriminative.sh
+
+# This script does discriminative training on top of nnet3 system.
+# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
+# since the lattice generation runs in about real-time, so takes of the order of
+# 1000 hours of CPU time.
+# 
+# Note: rather than using any features we have dumped on disk, this script
+# regenerates them from the wav data three times-- when we do lattice
+# generation, numerator alignment and discriminative training.  This made the
+# script easier to write and more generic, because we don't have to know where
+# the features and the iVectors are, but of course it's a little inefficient.
+# The time taken is dominated by the lattice generation anyway, so this isn't
+# a huge deal.
+
+. cmd.sh
+
+
+stage=0
+has_fisher=true  # if true, decodes are also rescored from data/lang_sw1_tg to data/lang_sw1_fsh_fg.
+get_egs_stage=-10
+use_gpu=true
+srcdir=exp/nnet3/nnet_ms_a
+criterion=smbr
+drop_frames=false  # only matters for MMI.
+frames_per_eg=150
+frames_overlap_per_eg=30
+effective_learning_rate=0.0000125
+max_param_change=1
+num_jobs_nnet=4
+train_stage=-10 # can be used to start training in the middle.
+decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
+num_epochs=4
+degs_dir=  # if empty, a name is derived from $srcdir and the egs options below.
+cleanup=false  # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
+               # alignments and degs).
+lats_dir=  # if empty, ${srcdir}_denlats is used (and generated at stage 1).
+train_data_dir=data/train_nodup_sp_hires
+online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp
+one_silence_class=true
+truncate_deriv_weights=10
+minibatch_size=64
+
+adjust_priors=true
+
+determinize=true
+minimize=true
+remove_output_symbols=true
+remove_epsilons=true
+collapse_transition_ids=true
+
+modify_learning_rates=true
+last_layer_factor=1.0
+
+set -e
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if $use_gpu; then
+  if ! cuda-compiled; then
+    cat <<EOF && exit 1 
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA 
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.  Otherwise, call this script with --use-gpu false
+EOF
+  fi
+  num_threads=1
+else
+  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
+  # almost the same, but this may be a little bit slow.
+  num_threads=16
+fi
+
+if [ ! -f ${srcdir}/final.mdl ]; then
+  echo "$0: expected ${srcdir}/final.mdl to exist; first run run_tdnn.sh or run_lstm.sh"
+  exit 1;
+fi
+
+
+if [ -z "$lats_dir" ]; then
+  lats_dir=${srcdir}_denlats
+  if [ $stage -le 1 ]; then
+    nj=50  # this doesn't really affect anything strongly, except the num-jobs for one of
+    # the phases of steps/nnet3/get_egs_discriminative.sh below.
+    num_threads_denlats=6
+    subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
+    # total slots = 80 * 6 = 480).
+    steps/nnet3/make_denlats.sh --cmd "$decode_cmd --mem 1G --num-threads $num_threads_denlats" \
+      --online-ivector-dir $online_ivector_dir \
+      --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \
+      $train_data_dir data/lang $srcdir ${lats_dir} || exit 1;
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # hardcode no-GPU for alignment, although you could use GPU [you wouldn't
+  # get excellent GPU utilization though.]
+  nj=350 # have a high number of jobs because this could take a while, and we might
+         # have some stragglers.
+  use_gpu=no
+  gpu_opts=
+
+  steps/nnet3/align.sh  --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
+     --online-ivector-dir $online_ivector_dir \
+     --nj $nj $train_data_dir data/lang $srcdir ${srcdir}_ali || exit 1;
+
+  # the command below is a more generic, but slower, way to do it.
+  # steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
+  #    --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
+fi
+
+left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` || exit 1
+right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` || exit 1
+
+frame_subsampling_opt=
+if [ -f $srcdir/frame_subsampling_factor ]; then
+  frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)"
+fi
+
+cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1
+
+if [ -z "$degs_dir" ]; then
+  degs_dir=${srcdir}_degs_n${frames_per_eg}_o${frames_overlap_per_eg}_f
+  if $determinize; then
+    degs_dir=${degs_dir}d
+  fi
+  if $minimize; then
+    degs_dir=${degs_dir}m
+  fi
+  if $remove_output_symbols; then
+    degs_dir=${degs_dir}r
+  fi
+  if $remove_epsilons; then
+    degs_dir=${degs_dir}e
+  fi
+  if $collapse_transition_ids; then
+    degs_dir=${degs_dir}c
+  fi
+
+  if [ $stage -le 3 ]; then
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $degs_dir/storage ]; then
+      utils/create_split_dir.pl \
+        /export/b0{1,2,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$degs_dir/storage $degs_dir/storage
+    fi
+    # allow more simultaneous jobs when the egs directory ($degs_dir) has split storage.
+    if [ -d $degs_dir/storage ]; then max_jobs=10; else max_jobs=5; fi
+
+    degs_opts="--determinize $determinize --minimize $minimize --remove-output-symbols $remove_output_symbols --remove-epsilons $remove_epsilons --collapse-transition-ids $collapse_transition_ids"
+
+    steps/nnet3/get_egs_discriminative.sh \
+      --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" --adjust-priors $adjust_priors \
+      --online-ivector-dir $online_ivector_dir --left-context $left_context --right-context $right_context $frame_subsampling_opt \
+      --criterion $criterion --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \
+      $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir || exit 1;
+
+    # the command below is a more generic, but slower, way to do it.
+    #steps/online/nnet2/get_egs_discriminative2.sh \
+      #  --cmd "$decode_cmd --max-jobs-run $max_jobs" \
+      #  --criterion $criterion --drop-frames $drop_frames \
+      #   data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
+  fi
+fi
+
+d=`basename $degs_dir`
+dir=${srcdir}_${criterion}_${effective_learning_rate}_degs${d##*degs}_ms${minibatch_size}
+
+if $one_silence_class; then
+  dir=${dir}_onesil
+fi
+
+if $modify_learning_rates; then
+  dir=${dir}_modify
+fi
+
+if [ "$last_layer_factor" != "1.0" ]; then
+  dir=${dir}_llf$last_layer_factor
+fi
+
+if [ $stage -le 4 ]; then
+  bash -x steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \
+    --stage $train_stage \
+    --effective-lrate $effective_learning_rate --max-param-change $max_param_change \
+    --criterion $criterion --drop-frames $drop_frames \
+    --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \
+    --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \
+    --truncate-deriv-weights $truncate_deriv_weights --adjust-priors $adjust_priors \
+    --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \
+      ${degs_dir} $dir || exit 1;
+fi
+
+graph_dir=exp/tri4/graph_sw1_tg
+
+if [ $stage -le 5 ]; then
+  for x in `seq $decode_start_epoch $num_epochs`; do
+    for decode_set in train_dev eval2000; do
+      (
+      num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
+      iter=epoch$x.adj
+      steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \
+        --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+        $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg_$iter || exit 1;
+      if $has_fisher; then
+        steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+          data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+          $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg}_$iter || exit 1;
+      fi
+      ) &
+    done
+  done
+fi
+wait;
+
+if [ $stage -le 6 ] && $cleanup; then
+  # run with "--cleanup true --stage 6" to remove the denlats, alignments and degs.
+  rm ${lats_dir}/lat.*.gz || true
+  rm ${srcdir}_ali/ali.*.gz || true
+  steps/nnet2/remove_egs.sh $degs_dir || true
+fi
+
+
+exit 0;
+
diff --git a/egs/swbd/s5c/local/score.sh b/egs/swbd/s5c/local/score.sh
index 81455d1e13a..40a49d0b41a 100755
--- a/egs/swbd/s5c/local/score.sh
+++ b/egs/swbd/s5c/local/score.sh
@@ -13,6 +13,7 @@ stage=0
 min_lmwt=5
 max_lmwt=20
 reverse=false
+iter=final
 word_ins_penalty=0.0,0.5,1.0
 #end configuration section.
 
diff --git a/egs/swbd/s5c/local/score_basic.sh b/egs/swbd/s5c/local/score_basic.sh
index 5d4fe08426a..0151ae82d33 100755
--- a/egs/swbd/s5c/local/score_basic.sh
+++ b/egs/swbd/s5c/local/score_basic.sh
@@ -6,6 +6,7 @@ cmd=run.pl
 min_lmwt=5
 max_lmwt=20
 reverse=false
+iter=final
 word_ins_penalty=0.0,0.5,1.0
 #end configuration section.
 
@@ -26,7 +27,7 @@ data=$1
 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
 dir=$3
 
-model=$dir/../final.mdl # assume model one level up from decoding dir.
+model=$dir/../$iter.mdl # assume model one level up from decoding dir.
 
 hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
 [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
@@ -45,7 +46,7 @@ function filter_text {
   perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
    while(<STDIN>) { @A  = split(" ", $_); $id = shift @A; print "$id ";
      foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
-   '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION'
+   '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION' '[noise]' '[laughter]' '[vocalized-noise]' '<unk>' '%hesitation'
 }
 
 for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh
index 3bce900aecf..7b8a620ac31 100755
--- a/egs/swbd/s5c/local/score_sclite.sh
+++ b/egs/swbd/s5c/local/score_sclite.sh
@@ -7,7 +7,9 @@ stage=0
 min_lmwt=5
 max_lmwt=20
 reverse=false
+iter=final
 word_ins_penalty=0.0,0.5,1.0
+get_conf=false
 #end configuration section.
 
 [ -f ./path.sh ] && . ./path.sh
@@ -28,7 +30,7 @@ data=$1
 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
 dir=$3
 
-model=$dir/../final.mdl # assume model one level up from decoding dir.
+model=$dir/../$iter.mdl # assume model one level up from decoding dir.
 
 hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
 [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
@@ -62,16 +64,28 @@ mkdir -p $dir/scoring/log
 
 if [ $stage -le 0 ]; then
   for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
-    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
-      mkdir -p $dir/score_LMWT_${wip}/ '&&' \
-      lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
-      lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
-      lattice-1best ark:- ark:- \| \
-      lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \
-      nbest-to-ctm $frame_shift_opt ark:- - \| \
-      utils/int2sym.pl -f 5 $lang/words.txt  \| \
-      utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
-      '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+    if ! $get_conf; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
+        mkdir -p $dir/score_LMWT_${wip}/ '&&' \
+        lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-1best ark:- ark:- \| \
+        lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \
+        nbest-to-ctm $frame_shift_opt ark:- - \| \
+        utils/int2sym.pl -f 5 $lang/words.txt  \| \
+        utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+        '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
+        mkdir -p $dir/score_LMWT_${wip}/ '&&' \
+        lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \
+        lattice-to-ctm-conf $frame_shift_opt ark:- - \| \
+        utils/int2sym.pl -f 5 $lang/words.txt  \| \
+        utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+        '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+    fi
   done
 fi
 
diff --git a/egs/wsj/s5/local/nnet3/run_discriminative.sh b/egs/wsj/s5/local/nnet3/run_discriminative.sh
new file mode 100755
index 00000000000..14ed587ade0
--- /dev/null
+++ b/egs/wsj/s5/local/nnet3/run_discriminative.sh
@@ -0,0 +1,224 @@
+#!/bin/bash
+
+set -e 
+set -o pipefail
+
+# this is run_discriminative.sh
+
+# This script does discriminative training on top of nnet3 system.
+# note: this relies on having a cluster that has plenty of CPUs as well as GPUs,
+# since the lattice generation runs in about real-time, so takes of the order of
+# 1000 hours of CPU time.
+# 
+# Note: rather than using any features we have dumped on disk, this script
+# regenerates them from the wav data three times-- when we do lattice
+# generation, numerator alignment and discriminative training.  This made the
+# script easier to write and more generic, because we don't have to know where
+# the features and the iVectors are, but of course it's a little inefficient.
+# The time taken is dominated by the lattice generation anyway, so this isn't
+# a huge deal.
+
+. cmd.sh
+
+
+stage=0
+train_stage=-10
+get_egs_stage=-10
+use_gpu=true
+srcdir=exp/nnet3/nnet_ms_a
+criterion=smbr
+drop_frames=false  # only matters for MMI.
+frames_per_eg=150
+frames_overlap_per_eg=30
+effective_learning_rate=0.0000125
+num_jobs_nnet=4
+train_stage=-10 # can be used to start training in the middle.
+decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more.
+num_epochs=4
+degs_dir=
+cleanup=false  # run with --cleanup true --stage 6 to clean up (remove large things like denlats,
+               # alignments and degs).
+lats_dir=
+train_data_dir=data/train_si284_hires
+online_ivector_dir=exp/nnet3/ivectors_train_si284
+one_silence_class=true
+truncate_deriv_weights=10
+minibatch_size=64
+
+adjust_priors=true
+
+determinize=true
+minimize=true
+remove_output_symbols=true
+remove_epsilons=true
+collapse_transition_ids=true
+
+modify_learning_rates=true
+last_layer_factor=1.0
+
+set -e
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if $use_gpu; then
+  if ! cuda-compiled; then
+    cat <<EOF && exit 1 
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA 
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.  Otherwise, call this script with --use-gpu false
+EOF
+  fi
+  num_threads=1
+else
+  # Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
+  # almost the same, but this may be a little bit slow.
+  num_threads=16
+fi
+
+if [ ! -f ${srcdir}/final.mdl ]; then
+  echo "$0: expected ${srcdir}/final.mdl to exist; first run run_tdnn.sh or run_lstm.sh"
+  exit 1;
+fi
+
+
+if [ -z "$lats_dir" ]; then
+  lats_dir=${srcdir}_denlats
+  if [ $stage -le 1 ]; then
+    nj=50  # this doesn't really affect anything strongly, except the num-jobs for one of
+    # the phases of steps/nnet3/get_egs_discriminative.sh below.
+    num_threads_denlats=6
+    subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving
+    # total slots = 80 * 6 = 480).
+    steps/nnet3/make_denlats.sh --cmd "$decode_cmd --mem 1G --num-threads $num_threads_denlats" \
+      --online-ivector-dir $online_ivector_dir \
+      --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode_dnn.config \
+      $train_data_dir data/lang $srcdir $lats_dir || exit 1;
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # hardcode no-GPU for alignment, although you could use GPU [you wouldn't
+  # get excellent GPU utilization though.]
+  nj=100 # have a high number of jobs because this could take a while, and we might
+         # have some stragglers.
+  use_gpu=no
+  gpu_opts=
+
+  steps/nnet3/align.sh  --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
+     --online-ivector-dir $online_ivector_dir \
+     --nj $nj $train_data_dir data/lang $srcdir ${srcdir}_ali || exit 1;
+
+  # the command below is a more generic, but slower, way to do it.
+  # steps/online/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu "$use_gpu" \
+  #    --nj $nj data/train_960 data/lang ${srcdir}_online ${srcdir}_ali || exit 1;
+fi
+
+left_context=14
+right_context=10
+
+cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1
+
+if [ -z "$degs_dir" ]; then
+  degs_dir=${srcdir}_degs_n${frames_per_eg}_o${frames_overlap_per_eg}_f
+  if $determinize; then
+    degs_dir=${degs_dir}d
+  fi
+  if $minimize; then
+    degs_dir=${degs_dir}m
+  fi
+  if $remove_output_symbols; then
+    degs_dir=${degs_dir}r
+  fi
+  if $remove_epsilons; then
+    degs_dir=${degs_dir}e
+  fi
+  if $collapse_transition_ids; then
+    degs_dir=${degs_dir}c
+  fi
+
+  if [ $stage -le 3 ]; then
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $degs_dir/storage ]; then
+      utils/create_split_dir.pl \
+        /export/b0{1,2,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$degs_dir/storage $degs_dir/storage
+    fi
+    # allow more simultaneous jobs when the egs directory ($degs_dir) has split storage.
+    if [ -d $degs_dir/storage ]; then max_jobs=10; else max_jobs=5; fi
+
+    degs_opts="--determinize $determinize --minimize $minimize --remove-output-symbols $remove_output_symbols --remove-epsilons $remove_epsilons --collapse-transition-ids $collapse_transition_ids"
+
+    steps/nnet3/get_egs_discriminative.sh \
+      --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \
+      --online-ivector-dir $online_ivector_dir --left-context $left_context --right-context $right_context \
+      --criterion $criterion --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \
+      $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir || exit 1;
+
+    # the command below is a more generic, but slower, way to do it.
+    #steps/online/nnet2/get_egs_discriminative2.sh \
+      #  --cmd "$decode_cmd --max-jobs-run $max_jobs" \
+      #  --criterion $criterion --drop-frames $drop_frames \
+      #   data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1;
+  fi
+fi
+
+d=`basename $degs_dir`
+dir=${srcdir}_${criterion}_${effective_learning_rate}_degs${d##*degs}_ms${minibatch_size}
+
+if $one_silence_class; then
+  dir=${dir}_onesil
+fi
+
+if $modify_learning_rates; then
+  dir=${dir}_modify
+fi
+
+if [ "$last_layer_factor" != "1.0" ]; then
+  dir=${dir}_llf$last_layer_factor
+fi
+
+if [ $truncate_deriv_weights -ne 0 ]; then
+  dir=${dir}_tr${truncate_deriv_weights}
+fi
+
+if [ $stage -le 4 ]; then
+  bash -x steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \
+    --stage $train_stage \
+    --effective-lrate $effective_learning_rate \
+    --criterion $criterion --drop-frames $drop_frames \
+    --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \
+    --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \
+    --truncate-deriv-weights $truncate_deriv_weights --adjust-priors $adjust_priors \
+    --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \
+      ${degs_dir} $dir || exit 1;
+fi
+
+graph_dir=exp/tri4/graph_sw1_tg
+
+if [ $stage -le 5 ]; then
+  # this does offline decoding that should give the same results as the real
+  # online decoding.
+  for x in `seq $decode_start_epoch $num_epochs`; do
+    iter=epoch$x.adj
+    for lm_suffix in tgpr bd_tgpr; do
+      graph_dir=exp/tri4b/graph_${lm_suffix}
+      # use already-built graphs.
+      for year in eval92 dev93; do
+        steps/nnet3/decode.sh --nj 8 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_test_$year --iter $iter \
+          $graph_dir data/test_${year}_hires $dir/decode_${lm_suffix}_${year}_iter$iter || exit 1;
+      done
+    done
+  done
+fi
+
+if [ $stage -le 6 ] && $cleanup; then
+  # run with "--cleanup true --stage 6" to remove the denlats, alignments and degs.
+  rm ${lats_dir}/lat.*.gz || true
+  rm ${srcdir}_ali/ali.*.gz || true
+  steps/nnet2/remove_egs.sh $degs_dir || true
+fi
+
+
+exit 0;
+
diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh
index 33be80d85b2..d6dd371035f 100755
--- a/egs/wsj/s5/steps/conf/apply_calibration.sh
+++ b/egs/wsj/s5/steps/conf/apply_calibration.sh
@@ -49,6 +49,12 @@ cp $calibration $dir/calibration.mdl
 cp $word_feats $dir/word_feats
 cp $word_categories $dir/word_categories
 
+if [ -f $dir/../frame_subsampling_factor ]; then
+  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
+  frame_shift_opt="--frame-shift=0.0$factor"
+  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
+fi
+
 # Create the ctm with raw confidences,
 # - we keep the timing relative to the utterance,
 if [ $stage -le 0 ]; then
@@ -58,7 +64,7 @@ if [ $stage -le 0 ]; then
     lattice-push --push-strings=false ark:- ark:- \| \
     lattice-align-words-lexicon --max-expand=10.0 \
      $lang/phones/align_lexicon.int $model ark:- ark:- \| \
-    lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \
+    lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr ark:- - \| \
     utils/int2sym.pl -f 5 $lang/words.txt \
     '>' $dir/JOB.ctm
   # Merge and clean,
@@ -75,8 +81,8 @@ fi
 
 # Create the forwarding data for logistic regression,
 if [ $stage -le 2 ]; then
-  steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \
-    $dir/ctm_int $word_feats $latdepth $word_categories
+  python steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \
+    --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories
 fi
 
 # Apply calibration model to dev,
diff --git a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py
index 709f60b8ad6..3ccdf6fb164 100755
--- a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py
+++ b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py
@@ -1,4 +1,4 @@
-#!/bin/env python
+#!/usr/bin/env python
 
 # Copyright 2015  Brno University of Technology (author: Karel Vesely)
 # Apache 2.0
diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py
index 003d77c5e8a..23db9633a1c 100755
--- a/egs/wsj/s5/steps/conf/prepare_calibration_data.py
+++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py
@@ -13,8 +13,8 @@
 The logisitc-regression input features are: 
 - posteriors from 'ctm' transformed by logit,
 - logarithm of word-length in letters,
-- logarithm of average lattice-depth at position of the word,
 - 10base logarithm of unigram probability of a word from language model,
+- logarithm of average lattice-depth at position of the word (optional),
 
 The logistic-regresion targets are:
 - 1 for correct word,
@@ -33,12 +33,13 @@
 parser = OptionParser(usage=usage, description=desc)
 parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='')
 parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='')
+parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). [default %default]", default='')
 (o, args) = parser.parse_args()
 
-if len(args) != 4:
+if len(args) != 3:
   parser.print_help()
   sys.exit(1)
-ctm_file, word_feats_file, depths_file, word_categories_file = args
+ctm_file, word_feats_file, word_categories_file = args
 
 assert(o.conf_feats != '')
 
@@ -76,10 +77,12 @@
 
 # Load the per-frame lattice-depth,
 # - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file,
-depths = dict()
-for l in open(depths_file):
-  utt,d = l.split(' ',1)
-  depths[utt] = map(int,d.split())
+# - the depths are loaded (and the feature is used) only if '--lattice-depth' is given,
+if o.lattice_depth:
+  depths = dict()
+  for l in open(o.lattice_depth):
+    utt,d = l.split(' ',1)
+    depths[utt] = map(int,d.split())
 
 # Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt',
 wrd_to_cat = [ l.split() for l in open(word_categories_file) ]
@@ -98,15 +101,19 @@
     logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper)
     # - log of word-length,
     log_word_length = math.log(word_length[wrd_id]) # i.e. number of phones in a word,
-    # - log of average-depth of lattice at the word position,
-    depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))]
-    log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice))
     # - categorical distribution of words (with frequency higher than min-count),
     wrd_1_of_k = [0]*wrd_cat_num; 
     wrd_1_of_k[wrd_to_cat[wrd_id]] = 1;
 
     # Compose the input feature vector,
-    feats = [ logit, log_word_length, log_avg_depth, other_feats[wrd_id] ] + wrd_1_of_k
+    feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k
+
+    # Optionally add average-depth of lattice at the word position,
+    if o.lattice_depth != '':
+      depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))]
+      log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice))
+      feats += [ log_avg_depth ]
+
     # Store the input features, 
     f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n')
 
diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh
index 64ca70022c8..c5d2082ab90 100755
--- a/egs/wsj/s5/steps/conf/train_calibration.sh
+++ b/egs/wsj/s5/steps/conf/train_calibration.sh
@@ -76,22 +76,22 @@ fi
 
 # Get evaluation of the 'ctm' using the 'text' reference,
 if [ $stage -le 1 ]; then
-  steps/conf/convert_ctm_to_tra.py $dir/ctm - | \
+  python steps/conf/convert_ctm_to_tra.py $dir/ctm - | \
   align-text --special-symbol="<eps>" ark:$data/text ark:- ark,t:- | \
   utils/scoring/wer_per_utt_details.pl --special-symbol "<eps>" \
   >$dir/align_text 
   # Append alignment to ctm,
-  steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned
+  python steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned
   # Convert words to 'ids',
   cat $dir/ctm_aligned | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_aligned_int
 fi
 
 # Prepare word-categories (based on wotd frequencies in 'ctm'),
 if [ -z "$category_text" ]; then
-  steps/conf/convert_ctm_to_tra.py $dir/ctm - | \
-  steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories
+  python steps/conf/convert_ctm_to_tra.py $dir/ctm - | \
+  python steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories
 else
-  steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories
+  python steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories
 fi
 
 # Compute lattice-depth,
@@ -102,9 +102,9 @@ fi
 
 # Create the training data for logistic regression,
 if [ $stage -le 3 ]; then
-  steps/conf/prepare_calibration_data.py \
+  python steps/conf/prepare_calibration_data.py \
     --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \
-    $dir/ctm_aligned_int $word_feats $latdepth $dir/word_categories
+    --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories
 fi
 
 # Train the logistic regression,
diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh
index 753411f4563..e4e726522f0 100755
--- a/egs/wsj/s5/steps/nnet2/decode.sh
+++ b/egs/wsj/s5/steps/nnet2/decode.sh
@@ -151,7 +151,7 @@ if [ $stage -le 2 ]; then
     [ ! -x local/score.sh ] && \
       echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
     echo "score best paths"
-    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
+    local/score.sh --iter $iter $scoring_opts --cmd "$cmd" $data $graphdir $dir
     echo "score confidence and timing with sclite"
   fi
 fi
diff --git a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh
index 4c08a08b824..b1c145b6157 100755
--- a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh
+++ b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh
@@ -28,6 +28,15 @@ online_ivector_dir=
 num_utts_subset=3000
 num_archives_priors=10
 
+left_context=
+right_context=
+
+collapse_transition_ids=true
+determinize=true
+minimize=true
+split=true
+excise=true
+
 # End configuration section.
 
 
@@ -248,8 +257,17 @@ if [ -d $dir/storage ]; then
 fi
 
 rm $dir/.error 2>/dev/null
-left_context=$(nnet-am-info $dir/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1
-right_context=$(nnet-am-info $dir/final.mdl | grep '^right-context' | awk '{print $2}') || exit 1
+if [ -z "$left_context" ]; then
+  left_context=$(nnet-am-info $dir/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1
+fi
+if [ -z "$right_context" ]; then
+  right_context=$(nnet-am-info $dir/final.mdl | grep '^right-context' | awk '{print $2}') || exit 1
+fi
+
+nnet_context_opts="--left-context=$left_context --right-context=$right_context"
+
+echo "left-context=$left_context"
+echo "right-context=$right_context"
 
 (
 
@@ -261,8 +279,6 @@ for y in `seq $num_archives_priors`; do
   priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark"
 done
 
-nnet_context_opts="--left-context=$left_context --right-context=$right_context"
-
 echo "$0: dumping egs for prior adjustment in the background."
 
 $cmd $dir/log/create_priors_subset.log \
@@ -279,13 +295,15 @@ fi
 
 ) &
 
+discriminative_egs_opts="--determinize=$determinize --minimize=$minimize --collapse-transition-ids=$collapse_transition_ids --split=$split --excise=$excise"
+
 if [ $stage -le 3 ]; then
   echo "$0: getting initial training examples by splitting lattices"
 
   degs_list=$(for n in $(seq $num_archives_temp); do echo ark:$dir/degs_orig.JOB.$n.ark; done)
 
   $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
-    nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames \
+    nnet-get-egs-discriminative --criterion=$criterion --drop-frames=$drop_frames $nnet_context_opts $discriminative_egs_opts \
       "$src_model" "$feats" "$ali_rspecifier" "ark,s,cs:gunzip -c $denlatdir/lat.JOB.gz|" ark:- \| \
     nnet-copy-egs-discriminative $const_dim_opt ark:- $degs_list || exit 1;
   sleep 5;  # wait a bit so NFS has time to write files.
diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh
index c840e014250..7bd4ecf5647 100755
--- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh
+++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh
@@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do
   echo >> $dir/indexes
   num_blocks=$[$num_blocks+1]
   cur_index=$[$cur_index+$block_shift]
-  if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then
+  if [ $[$cur_index+$block_size] -gt $feat_dim ]; then
     cur_index=$[$feat_dim-$block_size];
   fi
 done
diff --git a/egs/wsj/s5/steps/nnet2/train_discriminative2.sh b/egs/wsj/s5/steps/nnet2/train_discriminative2.sh
index 85047efc581..eb719838f36 100755
--- a/egs/wsj/s5/steps/nnet2/train_discriminative2.sh
+++ b/egs/wsj/s5/steps/nnet2/train_discriminative2.sh
@@ -98,7 +98,7 @@ dir=$2
 [ -z "$src_model" ] && src_model=$degs_dir/final.mdl
 
 # Check some files.
-for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_archive} $src_model; do
+for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl} $src_model; do
   [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
 done
 
diff --git a/egs/wsj/s5/steps/nnet3/adjust_priors.sh b/egs/wsj/s5/steps/nnet3/adjust_priors.sh
new file mode 100755
index 00000000000..d75eef0536d
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/adjust_priors.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+. path.sh
+
+cmd=run.pl
+prior_subset_size=20000 # 20k samples per job, for computing priors.
+num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
+use_gpu=false             # if true, we run on GPU.
+egs_type=egs
+raw=false
+use_degs=false
+iter=final
+
+. utils/parse_options.sh
+
+if $use_degs && [ $egs_type == egs ]; then
+  egs_type=degs
+fi
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 [opts] <exp-dir> <egs-dir>"
+  echo " e.g.: $0 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500/egs"
+  exit 1
+fi
+
+dir=$1
+egs_dir=$2
+
+if $use_gpu; then
+  prior_gpu_opt="--use-gpu=yes"
+  prior_queue_opt="--gpu 1"
+else
+  prior_gpu_opt="--use-gpu=no"
+  prior_queue_opt=""
+fi
+
+for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do 
+  if [ ! -f $f ]; then
+    echo "$f not found" 
+    exit 1 
+  fi
+done
+
+if $raw; then
+  model=$dir/$iter.raw
+else 
+  model="nnet3-am-copy --raw=true $dir/$iter.mdl - |"
+fi
+
+rm -f $dir/post.$iter.*.vec 2>/dev/null
+
+left_context=`cat $egs_dir/info/left_context` || exit 1
+right_context=`cat $egs_dir/info/right_context` || exit 1
+
+context_opts="--left-context=$left_context --right-context=$right_context"
+
+num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
+if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1;
+else egs_part=JOB; fi
+
+if ! $use_degs; then
+  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \
+    nnet3-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \
+    nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
+    nnet3-merge-egs ark:- ark:- \| \
+    nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \
+    "$model" ark:- ark:- \| \
+    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1;
+else 
+  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \
+    nnet3-discriminative-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \
+    nnet3-discriminative-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
+    nnet3-discriminative-merge-egs ark:- ark:- \| \
+    nnet3-compute-from-degs $prior_gpu_opt --apply-exp=true \
+    "$model" ark:- ark:- \| \
+    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1;
+
+fi
+
+sleep 3;  # make sure there is time for $dir/post.$iter.*.vec to appear.
+
+$cmd $dir/log/vector_sum.$iter.log \
+  vector-sum $dir/post.$iter.*.vec $dir/post.$iter.vec || exit 1;
+
+if ! $raw; then
+  run.pl $dir/log/adjust_priors.$iter.log \
+    nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/$iter.adj.mdl || exit 1;
+fi
+
+rm -f $dir/post.$iter.*.vec;
+
diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh
index 71947961da4..22d2a618ed7 100755
--- a/egs/wsj/s5/steps/nnet3/align.sh
+++ b/egs/wsj/s5/steps/nnet3/align.sh
@@ -118,9 +118,16 @@ echo "$0: aligning data in $data using model from $srcdir, putting alignments in
 
 tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|";
 
+frame_subsampling_opt=
+if [ -f $srcdir/frame_subsampling_factor ]; then
+  # e.g. for 'chain' systems
+  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
+  cp $srcdir/frame_subsampling_factor $dir
+fi
+
 $cmd JOB=1:$nj $dir/log/align.JOB.log \
   compile-train-graphs $dir/tree $srcdir/${iter}.mdl  $lang/L.fst "$tra" ark:- \| \
-  nnet3-align-compiled $scale_opts $ivector_opts \
+  nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \
     --use-gpu=$use_gpu --beam=$beam --retry-beam=$retry_beam \
     $srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
 
diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh
index fc75932d0d3..6eb0f51308b 100755
--- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh
+++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh
@@ -25,6 +25,8 @@ frames_per_eg=25   # number of frames of labels per example.  more->less disk sp
 frames_overlap_per_eg=0  # number of supervised frames of overlap that we aim for per eg.
                   # can be useful to avoid wasted data if you're using --left-deriv-truncate
                   # and --right-deriv-truncate.
+cut_zero_frames=-1  # if activated, activates new-style derivative weights.. i'll reorganize
+                    # this if it works well.
 frame_subsampling_factor=3 # ratio between input and output frame-rate of nnet.
 left_context=4    # amount of left-context per eg (i.e. extra frames of input features
                   # not present in the output supervision).
@@ -44,7 +46,9 @@ num_egs_diagnostic=400 # number of frames for "compute_prob" jobs
 frames_per_iter=400000 # each iteration of training, see this many frames
                        # per job.  This is just a guideline; it will pick a number
                        # that divides the number of samples in the entire data.
+
 right_tolerance=  #CTC right tolerance == max label delay.
+left_tolerance=  
 
 transform_dir=     # If supplied, overrides latdir as the place to find fMLLR transforms
 
@@ -263,7 +267,7 @@ if [ $stage -le 2 ]; then
 fi
 
 
-egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress"
+egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames"
 
 
 [ -z $valid_left_context ] &&  valid_left_context=$left_context;
@@ -275,6 +279,8 @@ ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$frame
 [ ! -z $right_tolerance ] && \
   ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance"
 
+[ ! -z $left_tolerance ] && \
+  ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance"
 
 echo $left_context > $dir/info/left_context
 echo $right_context > $dir/info/right_context
diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh
index 1a62d8d7bb6..f2af7d0fdcb 100755
--- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh
+++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh
@@ -23,17 +23,27 @@ truncate_deriv_weights=0  # can be used to set to zero the weights of derivs fro
 apply_deriv_weights=true
 initial_effective_lrate=0.0002
 final_effective_lrate=0.00002
+extra_left_context=0  # actually for recurrent setups.
 pnorm_input_dim=3000
 pnorm_output_dim=300
 relu_dim=  # you can use this to make it use ReLU's instead of p-norms.
+
+jesus_opts=  # opts to steps/nnet3/make_jesus_configs.py.
+             # If nonempty, assumes you want to use the jesus nonlinearity,
+             # and you should supply various options to that script in
+             # this string.
 rand_prune=4.0 # Relates to a speedup we do for LDA.
 minibatch_size=512  # This default is suitable for GPU-based training.
                     # Set it to 128 for multi-threaded CPU-based training.
 lm_opts=   # options to chain-est-phone-lm
+l2_regularize=0.0
+leaky_hmm_coefficient=0.00001
+xent_regularize=0.0
 frames_per_iter=800000  # each iteration of training, see this many [input]
                         # frames per job.  This option is passed to get_egs.sh.
                         # Aim for about a minute of training time
 right_tolerance=10
+left_tolerance=5
 denominator_scale=1.0 # relates to tombsone stuff.
 num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
 num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
@@ -66,6 +76,10 @@ exit_stage=-100 # you can set this to terminate the training early.  Exits befor
 
 # count space-separated fields in splice_indexes to get num-hidden-layers.
 splice_indexes="-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0"
+pool_type='none'
+pool_window=
+pool_lpfilter_width=
+
 # Format : layer<hidden_layer>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
 # note: hidden layers which are composed of one or more components,
 # so hidden layer indexing is different from component count
@@ -87,7 +101,7 @@ right_deriv_truncate=  # number of time-steps to avoid using the deriv of, on th
 
 # End configuration section.
 
-trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM
 
 echo "$0 $@"  # Print the command line for logging
 
@@ -197,23 +211,43 @@ num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exi
 
 if [ $stage -le -5 ]; then
   echo "$0: creating neural net configs";
-  if [ ! -z "$relu_dim" ]; then
-    dim_opts="--relu-dim $relu_dim"
+
+  if [ ! -z "$jesus_opts" ]; then
+    python steps/nnet3/make_jesus_configs.py \
+      --xent-regularize=$xent_regularize \
+      --include-log-softmax=false \
+      --splice-indexes "$splice_indexes"  \
+      --feat-dim $feat_dim \
+      --ivector-dim $ivector_dim  \
+       $jesus_opts \
+      --num-targets $num_leaves \
+      $dir/configs || exit 1;
   else
-    dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim  $pnorm_output_dim"
-  fi
+    [ $xent_regularize != "0.0" ] && \
+      echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1;
+    if [ ! -z "$relu_dim" ]; then
+      dim_opts="--relu-dim $relu_dim"
+    else
+      dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim  $pnorm_output_dim"
+    fi
 
-  # create the config files for nnet initialization
-  python steps/nnet3/make_tdnn_configs.py \
-    --include-log-softmax=false \
-    --final-layer-normalize-target $final_layer_normalize_target \
-    --splice-indexes "$splice_indexes"  \
-    --feat-dim $feat_dim \
-    --ivector-dim $ivector_dim  \
-     $dim_opts \
-    --num-targets $num_leaves \
-    --use-presoftmax-prior-scale false \
-   $dir/configs || exit 1;
+    # create the config files for nnet initialization
+    pool_opts=
+    pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "}
+    pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "}
+    pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "}
+
+    python steps/nnet3/tdnn/make_configs.py $pool_opts \
+      --include-log-softmax=false \
+      --final-layer-normalize-target $final_layer_normalize_target \
+      --splice-indexes "$splice_indexes"  \
+      --feat-dim $feat_dim \
+      --ivector-dim $ivector_dim  \
+      $dim_opts \
+      --num-targets $num_leaves \
+      --use-presoftmax-prior-scale false \
+      $dir/configs || exit 1;
+  fi
 
   # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
   # matrix.  This first config just does any initial splicing that we do;
@@ -242,12 +276,14 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
   extra_opts+=(--transform-dir $transform_dir)
   # we need a bit of extra left-context and right-context to allow for frame
   # shifts (we use shifted version of the data for more variety).
-  extra_opts+=(--left-context $[$left_context+$frame_subsampling_factor/2])
+  extra_opts+=(--left-context $[$left_context+$frame_subsampling_factor/2+$extra_left_context])
   extra_opts+=(--right-context $[$right_context+$frame_subsampling_factor/2])
   echo "$0: calling get_egs.sh"
   steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \
       --frames-per-iter $frames_per_iter --stage $get_egs_stage \
       --cmd "$cmd" \
+      --right-tolerance "$right_tolerance" \
+      --left-tolerance "$left_tolerance" \
       --frames-per-eg $frames_per_eg \
       --frame-subsampling-factor $frame_subsampling_factor \
       $data $dir $latdir $dir/egs || exit 1;
@@ -414,11 +450,11 @@ while [ $x -lt $num_iters ]; do
     # Set off jobs doing some diagnostics, in the background.
     # Use the egs dir from the previous iteration for the diagnostics
     $cmd $dir/log/compute_prob_valid.$x.log \
-      nnet3-chain-compute-prob  \
+      nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
           "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \
           "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" &
     $cmd $dir/log/compute_prob_train.$x.log \
-      nnet3-chain-compute-prob \
+      nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient  --xent-regularize=$xent_regularize \
           "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \
           "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" &
 
@@ -461,7 +497,9 @@ while [ $x -lt $num_iters ]; do
     rm $dir/.error 2>/dev/null
 
 
-    ( # this sub-shell is so that when we "wait" below,
+    (
+      trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM
+      # this sub-shell is so that when we "wait" below,
       # we only wait for the training jobs that we just spawned,
       # not the diagnostic jobs that we spawned above.
 
@@ -476,6 +514,7 @@ while [ $x -lt $num_iters ]; do
 
         $cmd $train_queue_opt $dir/log/train.$x.$n.log \
           nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \
+             --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
               $parallel_train_opts $deriv_time_opts \
              --max-param-change=$this_max_param_change \
             --print-interval=10 "$mdl" $dir/den.fst \
@@ -543,7 +582,7 @@ if [ $stage -le $num_iters ]; then
   # num-threads to 8 to speed it up (this isn't ideal...)
 
   $cmd $combine_queue_opt $dir/log/combine.log \
-    nnet3-chain-combine --num-iters=40 \
+    nnet3-chain-combine --num-iters=40  --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \
        --enforce-sum-to-one=true --enforce-positive-weights=true \
        --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \
        "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1;
@@ -553,11 +592,11 @@ if [ $stage -le $num_iters ]; then
   # the same subset we used for the previous compute_probs, as the
   # different subsets will lead to different probs.
   $cmd $dir/log/compute_prob_valid.final.log \
-    nnet3-chain-compute-prob \
+    nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
            "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \
     "ark:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" &
   $cmd $dir/log/compute_prob_train.final.log \
-    nnet3-chain-compute-prob \
+    nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \
       "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \
     "ark:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" &
 fi
diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py
index 87323a1c3e1..1fc49290dfe 100644
--- a/egs/wsj/s5/steps/nnet3/components.py
+++ b/egs/wsj/s5/steps/nnet3/components.py
@@ -6,6 +6,45 @@
 import sys
 import warnings
 import copy
+from operator import itemgetter
+import numpy as np
+try:
+    import scipy.signal as signal
+    has_scipy_signal = True
+except ImportError:
+    has_scipy_signal = False
+
+def WriteKaldiMatrix(matrix, matrix_file_name):
+    assert(len(matrix.shape) == 2)
+    # matrix is a numpy array
+    matrix_file = open(matrix_file_name, "w")
+    [rows, cols ] = matrix.shape
+    matrix_file.write('[\n')
+    for row in range(rows):
+        matrix_file.write(' '.join( map(lambda x: '{0:f}'.format(x), matrix[row, : ])))
+        if row == rows - 1:
+            matrix_file.write("]")
+        else:
+            matrix_file.write('\n')
+    matrix_file.close()
+def GetSumDescriptor(inputs):
+    sum_descriptors = inputs
+    while len(sum_descriptors) != 1:
+        cur_sum_descriptors = []
+        pair = []
+        while len(sum_descriptors) > 0:
+            value = sum_descriptors.pop()
+            if value.strip() != '':
+                pair.append(value)
+            if len(pair) == 2:
+                cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1]))
+                pair = []
+        if pair:
+            cur_sum_descriptors.append(pair[0])
+        sum_descriptors = cur_sum_descriptors
+    return sum_descriptors
+
+
 
 # adds the input nodes and returns the descriptor
 def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0):
@@ -19,11 +58,26 @@ def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0):
         components.append('input-node name=ivector dim=' + str(ivector_dim))
         list.append('ReplaceIndex(ivector, t, 0)')
         output_dim += ivector_dim
-    splice_descriptor = "Append({0})".format(", ".join(list))
+    if len(list) > 1:
+        splice_descriptor = "Append({0})".format(", ".join(list))
+    else:
+        splice_descriptor = list[0]
     print(splice_descriptor)
     return {'descriptor': splice_descriptor,
             'dimension': output_dim}
 
+def AddNoOpLayer(config_lines, name, input):
+    components = config_lines['components']
+    component_nodes = config_lines['component-nodes']
+
+    components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension']))
+    component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor']))
+
+    return {'descriptor':  '{0}_noop'.format(name),
+            'dimension': input['dimension']}
+
+
+
 def AddLdaLayer(config_lines, name, input, lda_file):
     components = config_lines['components']
     component_nodes = config_lines['component-nodes']
@@ -34,6 +88,30 @@ def AddLdaLayer(config_lines, name, input, lda_file):
     return {'descriptor':  '{0}_lda'.format(name),
             'dimension': input['dimension']}
 
+def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks):
+    components = config_lines['components']
+    component_nodes = config_lines['component-nodes']
+    assert((input['dimension'] % num_blocks == 0) and
+            (output_dim % num_blocks == 0))
+    components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks))
+    component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor']))
+
+    return {'descriptor' : '{0}_block_affine'.format(name),
+                           'dimension' : output_dim}
+
+
+def AddPermuteLayer(config_lines, name, input, column_map):
+    components = config_lines['components']
+    component_nodes = config_lines['component-nodes']
+    permute_indexes = ",".join(map(lambda x: str(x), column_map))
+    components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes))
+    component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor']))
+
+    return {'descriptor': '{0}_permute'.format(name),
+            'dimension': input['dimension']}
+
+
+
 def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""):
     components = config_lines['components']
     component_nodes = config_lines['component-nodes']
@@ -44,13 +122,13 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""
     return {'descriptor':  '{0}_affine'.format(name),
             'dimension': output_dim}
 
-def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = ""):
+def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0):
     components = config_lines['components']
     component_nodes = config_lines['component-nodes']
 
     components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options))
     components.append("component name={0}_relu type=RectifiedLinearComponent dim={1}".format(name, output_dim))
-    components.append("component name={0}_renorm type=NormalizeComponent dim={1}".format(name, output_dim))
+    components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms))
 
     component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor']))
     component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name))
@@ -60,6 +138,36 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options
             'dimension': output_dim}
 
 
+def AddConvolutionLayer(config_lines, name, input,
+                       input_x_dim, input_y_dim, input_z_dim,
+                       filt_x_dim, filt_y_dim,
+                       filt_x_step, filt_y_step,
+                       num_filters, input_vectorization,
+                       param_stddev = None, bias_stddev = None,
+                       filter_bias_file = None,
+                       is_updatable = True):
+    assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim)
+    components = config_lines['components']
+    component_nodes = config_lines['component-nodes']
+
+    conv_init_string = "component name={0}_conv type=ConvolutionComponent input-x-dim={1} input-y-dim={2} input-z-dim={3} filt-x-dim={4} filt-y-dim={5} filt-x-step={6} filt-y-step={7} input-vectorization-order={8}".format(name, input_x_dim, input_y_dim, input_z_dim, filt_x_dim, filt_y_dim, filt_x_step, filt_y_step, input_vectorization)
+    if filter_bias_file is not None:
+        conv_init_string += " matrix={0}".format(filter_bias_file)
+    if is_updatable:
+        conv_init_string += " is-updatable=true"
+    else:
+        conv_init_string += " is-updatable=false"
+
+    components.append(conv_init_string)
+    component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor']))
+
+    num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step)
+    num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step)
+    output_dim = num_x_steps * num_y_steps * num_filters;
+    return {'descriptor':  '{0}_conv_t'.format(name),
+            'dimension': output_dim}
+
+
 
 def AddSoftmaxLayer(config_lines, name, input):
     components = config_lines['components']
@@ -72,7 +180,7 @@ def AddSoftmaxLayer(config_lines, name, input):
             'dimension': input['dimension']}
 
 
-def AddOutputNode(config_lines, input, label_delay=None):
+def AddOutputLayer(config_lines, input, label_delay=None):
     components = config_lines['components']
     component_nodes = config_lines['component-nodes']
     if label_delay is None:
@@ -80,12 +188,18 @@ def AddOutputNode(config_lines, input, label_delay=None):
     else:
         component_nodes.append('output-node name=output input=Offset({0},{1})'.format(input['descriptor'], label_delay))
 
-def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = "", label_delay=None, include_softmax = "true"):
+def AddFinalLayer(config_lines, input, output_dim, ng_affine_options = " param-stddev=0 bias-stddev=0 ", label_delay=None, use_presoftmax_prior_scale = False, prior_scale_file = None, include_log_softmax = True):
+    components = config_lines['components']
+    component_nodes = config_lines['component-nodes']
+    
     prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim, ng_affine_options)
-    if include_softmax == "true":
-      prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output)
-    AddOutputNode(config_lines, prev_layer_output, label_delay)
-
+    if include_log_softmax:
+        if use_presoftmax_prior_scale :
+            components.append('component name=Final-fixed-scale type=FixedScaleComponent scales={0}'.format(prior_scale_file))
+            component_nodes.append('component-node name=Final-fixed-scale component=Final-fixed-scale input={0}'.format(prev_layer_output['descriptor']))
+            prev_layer_output['descriptor'] = "Final-fixed-scale"
+        prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output)
+    AddOutputLayer(config_lines, prev_layer_output, label_delay)
 
 def AddLstmLayer(config_lines,
                  name, input, cell_dim,
diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh
index f4de09740ae..1a60118c67c 100755
--- a/egs/wsj/s5/steps/nnet3/decode.sh
+++ b/egs/wsj/s5/steps/nnet3/decode.sh
@@ -26,6 +26,10 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel
 parallel_opts=  # ignored now.
 scoring_opts=
 skip_scoring=false
+extra_left_context=0
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
 feat_type=
 online_ivector_dir=
 minimize=false
@@ -132,11 +136,12 @@ if [ ! -z "$online_ivector_dir" ]; then
 fi
 
 if [ "$post_decode_acwt" == 1.0 ]; then
-  lat_wspecifier="ark|gzip -c >$dir/lat.JOB.gz"
+  lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz"
 else
   lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz"
 fi
 
+frame_subsampling_opt=
 if [ -f $srcdir/frame_subsampling_factor ]; then
   # e.g. for 'chain' systems
   frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
@@ -146,6 +151,10 @@ if [ $stage -le 1 ]; then
   $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
     nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
      --frames-per-chunk=$frames_per_chunk \
+     --extra-left-context=$extra_left_context \
+     --extra-right-context=$extra_right_context \
+     --extra-left-context-initial=$extra_left_context_initial \
+     --extra-right-context-final=$extra_right_context_final \
      --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \
      --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \
      --word-symbol-table=$graphdir/words.txt "$model" \
@@ -161,7 +170,7 @@ if [ $stage -le 2 ]; then
     [ ! -x local/score.sh ] && \
       echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
     echo "score best paths"
-    local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir
+    local/score.sh --iter $iter $scoring_opts --cmd "$cmd" $data $graphdir $dir
     echo "score confidence and timing with sclite"
   fi
 fi
diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py
index 88cf54e824e..2290c4d2e7f 100755
--- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py
+++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py
@@ -34,6 +34,11 @@
         'shape':'box',
         'style':'filled'
     },
+    'ConvolutionComponent':{
+        'color':'lightpink',
+        'shape':'box',
+        'style':'filled'
+    },
     'FixedScaleComponent':{
         'color':'blueviolet',
         'shape':'box',
@@ -64,6 +69,11 @@
         'shape':'rectangle',
         'style':'filled'
     },
+    'ClipGradientComponent':{
+        'color':'bisque',
+        'shape':'rectangle',
+        'style':'filled'
+    },
     'ElementwiseProductComponent':{
         'color':'green',
         'shape':'rectangle',
@@ -84,10 +94,10 @@ def GetDotNodeName(name_string, is_component = False):
     #   2. Nnet3 names can be shared among components and component nodes
     #      dot does not allow common names
     #
-    name_string = re.sub("-", "hyphen", name_string)
+    node_name_string = re.sub("-", "hyphen", name_string)
     if is_component:
-        name_string += name_string.strip() + "_component"
-    return name_string
+        node_name_string += node_name_string.strip() + "_component"
+    return {"label":name_string, "node":node_name_string}
 
 def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None):
     dot_graph = []
@@ -96,18 +106,18 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes =
     for i in range(len(segment['sub_segments'])):
         sub_segment = segment['sub_segments'][i]
         part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i)
-        names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i))
+        names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i))
         dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name)
 
     part_index = len(segment['sub_segments'])
     for i in range(len(segment['arguments'])):
         part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i)
-        names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i))
-        dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name)))
+        names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i))
+        dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node']))
 
     label = "|".join(names)
     label = "{{"+label+"}|Append}"
-    dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name), label))
+    dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name)['node'], label))
 
     attr_string = ''
     if edge_attributes is not None:
@@ -116,7 +126,7 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes =
         if edge_attributes.has_key('style'):
             attr_string += ' style={0} '.format(edge_attributes['style'])
 
-    dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name))
+    dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node'])
 
     if attr_string != '':
         dot_string += ' [{0}] '.format(attr_string)
@@ -125,6 +135,28 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes =
 
     return dot_graph
 
+def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None):
+    dot_graph = []
+
+    label = 'Round ({0})'.format(segment['arguments'][1])
+    style = None
+    if edge_attributes is not None:
+        if edge_attributes.has_key('label'):
+            label = "{0} {1}".format(edge_attributes['label'], label)
+        if edge_attributes.has_key('style'):
+            style  = 'style={0}'.format(edge_attributes['style'])
+
+    attr_string = 'label="{0}"'.format(label)
+    if style is not None:
+        attr_string += ' {0}'.format(style)
+    dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'],
+                                                                    GetDotNodeName(parent_node_name)['node'],
+                                                                    attr_string))
+    if segment['sub_segments']:
+        raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed")
+    return dot_graph
+
+
 def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None):
     dot_graph = []
 
@@ -140,8 +172,8 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes =
     if style is not None:
         attr_string += ' {0}'.format(style)
 
-    dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0]),
-                                                                    GetDotNodeName(parent_node_name),
+    dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'],
+                                                                    GetDotNodeName(parent_node_name)['node'],
                                                                     attr_string))
     if segment['sub_segments']:
         raise Exception("Offset can just deal with forwarding descriptor, no sub-segments allowed")
@@ -151,21 +183,23 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non
     dot_graph = []
     names = []
     desc_name = 'Sum_{0}'.format(affix)
+    # create the sum node
     for i in range(len(segment['sub_segments'])):
         sub_segment = segment['sub_segments'][i]
         part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i)
-        names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i))
-        dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name)
+        names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i))
+        dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i))
 
+    # link the sum node parts to corresponding segments
     part_index = len(segment['sub_segments'])
     for i in range(len(segment['arguments'])):
         part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i)
-        names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i))
-        dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name)))
+        names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i))
+        dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node']))
 
     label = "|".join(names)
     label = '{{'+label+'}|Sum}'
-    dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name), label))
+    dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name)['node'], label))
 
     attr_string = ''
     if edge_attributes is not None:
@@ -174,7 +208,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non
         if edge_attributes.has_key('style'):
             attr_string += ' style={0} '.format(edge_attributes['style'])
 
-    dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name))
+    dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node'])
 
     dot_string += ' [{0} tailport=s ] '.format(attr_string)
     dot_graph.append(dot_string)
@@ -195,8 +229,8 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu
     if style is not None:
         attr_string += ' {0}'.format(style)
 
-    dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0]),
-                                                                    GetDotNodeName(parent_node_name),
+    dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'],
+                                                                    GetDotNodeName(parent_node_name)['node'],
                                                                     attr_string))
     if segment['sub_segments']:
         raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed")
@@ -215,7 +249,7 @@ def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes
         dot_graph += DescriptorSegmentToDot(sub_segment, parent_node_name, parent_node_name, edge_attributes={'style':'dotted', 'label':'IfDefined'})
 
     if segment['arguments']:
-        dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0]), GetDotNodeName(parent_node_name)))
+        dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0])['node'], GetDotNodeName(parent_node_name)['node']))
 
     return dot_graph
 
@@ -232,6 +266,8 @@ def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = N
         dot_graph += ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes)
     elif segment['name'] == "ReplaceIndex":
         dot_graph += ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes)
+    elif segment['name'] == "Round":
+        dot_graph += ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes)
     else:
         raise Exception('Descriptor {0}, is not recognized by this script. Please add Process{0}Descriptor method'.format(segment['name']))
     return dot_graph
@@ -244,7 +280,7 @@ def Nnet3DescriptorToDot(descriptor, parent_node_name):
             dot_lines += DescriptorSegmentToDot(segment, parent_node_name, parent_node_name)
     elif arguments:
         assert(len(arguments) == 1)
-        dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0]), GetDotNodeName(parent_node_name)))
+        dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0])['node'], GetDotNodeName(parent_node_name)['node']))
     return dot_lines
 
 def ParseNnet3String(string):
@@ -298,27 +334,28 @@ def Nnet3ComponentToDot(component_config, component_attributes = None):
     except KeyError:
         pass
 
-    return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True), label, attr_string)]
+    return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)]
 
 
 # input-node name=input dim=40
 def Nnet3InputToDot(parsed_config):
-    return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['dim'] )]
+    return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['dim'] )]
 
 # output-node name=output input=Final_log_softmax dim=3940 objective=linear
+#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear
 def Nnet3OutputToDot(parsed_config):
     dot_graph = []
-    dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['objective']))
-    dot_graph.append('{0} -> {1}'.format(GetDotNodeName(parsed_config['input']), GetDotNodeName(parsed_config['name'])))
+    dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name'])
+    dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['objective']))
     return dot_graph
 
 # dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256
 def Nnet3DimrangeToDot(parsed_config):
     dot_graph = []
-    dot_graph.append(parsed_config['name'])
-    dot_graph.append('{0} [shape=rectangle]'.format(GetDotNodeName(parsed_config['name'])))
-    dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node']),
-                                                           GetDotNodeName(parsed_config['name']),
+    dot_node = GetDotNodeName(parsed_config['name'])
+    dot_graph.append('{0} [shape=rectangle, label="{1}"]'.format(dot_node['node'], dot_node['label']))
+    dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node'])['node'],
+                                                           GetDotNodeName(parsed_config['name'])['node'],
                                                            parsed_config['dim-offset'],
                                                            parsed_config['dim']))
     return dot_graph
@@ -326,9 +363,10 @@ def Nnet3DimrangeToDot(parsed_config):
 def Nnet3ComponentNodeToDot(parsed_config):
     dot_graph = []
     dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name'])
-    dot_graph.append('{0} [ label="{1}", shape=box ]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name']))
-    dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True),
-                                                       GetDotNodeName(parsed_config['name'])))
+    dot_node = GetDotNodeName(parsed_config['name'])
+    dot_graph.append('{0} [ label="{1}", shape=box ]'.format(dot_node['node'], dot_node['label']))
+    dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True)['node'],
+                                                       GetDotNodeName(parsed_config['name'])['node']))
     return dot_graph
 
 def GroupConfigs(configs, node_prefixes = []):
@@ -408,6 +446,8 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ):
                         " will be clustered together in the dot-graph"
                         " --node-prefixes Lstm1,Lstm2,Layer1", default=None)
 
+    parser.add_argument("dotfile", help="name of the dot output file")
+
     print(' '.join(sys.argv), file=sys.stderr)
 
     args = parser.parse_args()
@@ -420,4 +460,7 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ):
 
     lines = sys.stdin.readlines()
     dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes)
-    print("\n".join(dot_graph))
+
+    dotfile_handle = open(args.dotfile, "w")
+    dotfile_handle.write("\n".join(dot_graph))
+    dotfile_handle.close()
diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh
index dc8cac9c0b0..364f6a72443 100755
--- a/egs/wsj/s5/steps/nnet3/get_egs.sh
+++ b/egs/wsj/s5/steps/nnet3/get_egs.sh
@@ -170,8 +170,8 @@ esac
 
 if [ -f $dir/trans.scp ]; then
   feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
-  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |"
-  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |"
+  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |"
+  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |"
 fi
 
 if [ ! -z "$online_ivector_dir" ]; then
diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh
new file mode 100755
index 00000000000..2112b0ba227
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh
@@ -0,0 +1,469 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
+# Copyright 2014-2015   Vimal Manohar
+
+# This script dumps examples for MPE, MMI or state-level minimum Bayes risk (sMBR)
+# training of neural nets.  
+# Criteria supported are mpe, smbr and mmi
+
+# Begin configuration section.
+cmd=run.pl
+feat_type=raw     # set it to 'lda' to use LDA features.
+frames_per_eg=150 # number of frames of labels per example.  more->less disk space and
+                  # less time preparing egs, but more I/O during training.
+                  # note: the script may reduce this if reduce_frames_per_eg is true.
+frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg.
+                  # can be useful to avoid wasted data if you're using --left-deriv-truncate
+                  # and --right-deriv-truncate.
+frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet.
+                           # this should be read from the nnet. For now, it is taken as an option
+left_context=4    # amount of left-context per eg (i.e. extra frames of input features
+                  # not present in the output supervision).
+right_context=4   # amount of right-context per eg.
+valid_left_context=   # amount of left_context for validation egs, typically used in
+                      # recurrent architectures to ensure matched condition with
+                      # training egs
+valid_right_context=  # amount of right_context for validation egs
+adjust_priors=true
+priors_left_context=   # amount of left_context for priors egs
+priors_right_context=   # amount of right_context for priors egs
+compress=true   # set this to false to disable compression (e.g. if you want to see whether
+                # results are affected).
+num_utts_subset=80     # number of utterances in validation and training
+                        # subsets used for shrinkage and diagnostics.
+
+frames_per_iter=400000 # each iteration of training, see this many frames
+                       # per job.  This is just a guideline; it will pick a number
+                       # that divides the number of samples in the entire data.
+
+determinize=false
+minimize=false
+remove_output_symbols=false
+remove_epsilons=false
+collapse_transition_ids=false
+acwt=0.1
+
+criterion=smbr
+
+stage=0
+#nj=15         # This should be set to the maximum number of jobs you are
+#              # comfortable to run in parallel; you can increase it if your disk
+#              # speed is greater and you have more machines.
+max_shuffle_jobs_run=50 
+
+transform_dir= # If this is a SAT system, directory for transforms
+online_ivector_dir=
+cmvn_opts=  # can be used for specifying CMVN options, if feature type is not lda (if lda,
+            # it doesn't make sense to use different options than were used as input to the
+            # LDA transform).  This is used to turn off CMVN in the online-nnet experiments.
+
+num_priors_subset=100
+num_archives_priors=10
+
+# End configuration section.
+
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 6 ]; then  # exactly six positional arguments are required; otherwise print usage and exit
+  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <denlat-dir> <src-model-file> <degs-dir>"
+  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs (probably would be good to add -tc 5 or so if using"
+  echo "                                                   # GridEngine (to avoid excessive NFS traffic)."
+  echo "  --frames-per-iter <#frames|400000>               # Number of frames of data to process per iteration, per"
+  echo "                                                   # job."
+  echo "  --stage <stage|0>                                # Used to run a partially-completed training process from somewhere in"
+  echo "                                                   # the middle."
+  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
+  echo "  --online-ivector-dir <dir|\"\">                    # Directory for online-estimated iVectors, used in the"
+  echo "                                                   # online-neural-net setup.  (but you may want to use"
+  echo "                                                   # steps/online/nnet2/get_egs_discriminative2.sh instead)"
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+denlatdir=$4
+src_model=$5
+dir=$6
+
+extra_files=
+[ ! -z $online_ivector_dir ] && \
+  extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp"
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \
+         $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+mkdir -p $dir/log $dir/info || exit 1;
+
+[ "$(readlink /bin/sh)" == dash ] && \
+  echo "This script won't work if /bin/sh points to dash.  make it point to bash." && exit 1
+
+nj=$(cat $denlatdir/num_jobs) || exit 1;
+
+sdata=$data/split$nj
+utils/split_data.sh $data $nj
+
+# Get list of validation utterances.
+awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
+    > $dir/valid_uttlist || exit 1;
+
+if [ -f $data/utt2uniq ]; then  # this matters if you use data augmentation.
+  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
+  echo "include all perturbed versions of the same 'real' utterances."
+  mv $dir/valid_uttlist $dir/valid_uttlist.tmp
+  utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
+  cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
+    sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
+    awk '{for(n=1;n<=NF;n++) print $n;}' | sort  > $dir/valid_uttlist
+  rm $dir/uniq2utt $dir/valid_uttlist.tmp
+fi
+
+awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
+   utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
+
+[ -z "$transform_dir" ] && transform_dir=$alidir
+  
+if [ $stage -le 1 ]; then
+  nj_ali=$(cat $alidir/num_jobs)
+  all_ids=$(seq -s, $nj_ali)
+  $cmd $dir/log/copy_alignments.log \
+    copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \
+    ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
+fi
+
+prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |"
+
+if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then
+  echo "$0: using transforms from $transform_dir"
+  if [ $stage -le 0 ]; then
+    $cmd $dir/log/copy_transforms.log \
+      copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp"
+  fi
+fi
+if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then
+  echo "$0: using raw transforms from $transform_dir"
+  if [ $stage -le 0 ]; then
+    $cmd $dir/log/copy_transforms.log \
+      copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp"
+  fi
+fi
+
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+cp $alidir/tree $dir
+cp $lang/phones/silence.csl $dir/info/
+cp $src_model $dir/final.mdl || exit 1
+
+# Get list of utterances for prior computation.
+awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
+  utils/shuffle_list.pl | head -$num_priors_subset \
+  > $dir/priors_uttlist || exit 1;
+
+## We don't support deltas here, only LDA or raw (mainly because deltas are less
+## frequently used).
+if [ -z $feat_type ]; then
+  if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi
+fi
+echo "$0: feature type is $feat_type"
+
+case $feat_type in
+  raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |"
+    valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
+    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
+    priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |"
+    echo $cmvn_opts > $dir/cmvn_opts
+   ;;
+  lda)
+    splice_opts=`cat $alidir/splice_opts 2>/dev/null`
+    cp $alidir/splice_opts $dir 2>/dev/null
+    cp $alidir/final.mat $dir
+    [ ! -z "$cmvn_opts" ] && \
+       echo "You cannot supply --cmvn-opts option if feature type is LDA." && exit 1;
+    cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
+    cp $alidir/cmvn_opts $dir 2>/dev/null
+    feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+    valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+    train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+    priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+    ;;
+  *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+
+if [ -f $dir/trans.scp ]; then  # transforms were copied into $dir: append transform-feats to every feature pipeline
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |"
+  valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |"
+  train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |"
+  priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |"
+fi
+
+if [ ! -z $online_ivector_dir ]; then
+  ivector_period=$(cat $online_ivector_dir/ivector_period)
+  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
+  echo $ivector_dim >$dir/info/ivector_dim
+
+  ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'"
+  valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'"
+  train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'"
+  priors_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'"
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: working out number of frames of training data"
+  num_frames=$(steps/nnet2/get_num_frames.sh $data)
+  echo $num_frames > $dir/info/num_frames
+  echo "$0: working out feature dim"
+  feats_one="$(echo $feats | sed s/JOB/1/g)"
+  feat_dim=$(feat-to-dim "$feats_one" -) || exit 1;
+  echo $feat_dim > $dir/info/feat_dim
+else
+  num_frames=$(cat $dir/info/num_frames) || exit 1;
+  feat_dim=$(cat $dir/info/feat_dim) || exit 1;
+fi
+
+# Working out total number of archives. Add one on the assumption the
+# num-frames won't divide exactly, and we want to round up.
+num_archives=$[$num_frames/$frames_per_iter+1]
+
+# We may have to first create a smaller number of larger archives, with number
+# $num_archives_intermediate, if $num_archives is more than the maximum number
+# of open filehandles that the system allows per process (ulimit -n).
+max_open_filehandles=$(ulimit -n) || exit 1
+num_archives_intermediate=$num_archives
+archives_multiple=1
+while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do
+  archives_multiple=$[$archives_multiple+1]
+  num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1;
+done
+# now make sure num_archives is an exact multiple of archives_multiple.
+num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1;
+
+echo $num_archives >$dir/info/num_archives
+echo $frames_per_eg >$dir/info/frames_per_eg
+# Work out the number of egs per archive
+egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1;
+! [ $egs_per_archive -le $frames_per_iter ] && \
+  echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \
+  && exit 1;
+
+echo $egs_per_archive > $dir/info/egs_per_archive
+
+echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with"
+echo "$0:   $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)"
+
+
+if [ -e $dir/storage ]; then
+  # Make soft links to storage directories, if distributing this way..  See
+  # utils/create_split_dir.pl.
+  echo "$0: creating data links"
+  utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.ark; done)
+  for x in $(seq $num_archives_intermediate); do
+    utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.$x.ark; done)
+  done
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: copying training lattices"
+
+  $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \
+    lattice-copy --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \
+    "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1;
+
+  for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp
+fi
+
+splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt"
+
+[ -z $valid_left_context ] &&  valid_left_context=$left_context;
+[ -z $valid_right_context ] &&  valid_right_context=$right_context;
+
+[ -z $priors_left_context ] &&  priors_left_context=$left_context;
+[ -z $priors_right_context ] &&  priors_right_context=$right_context;
+
+left_context=$[left_context+frame_subsampling_factor/2]
+right_context=$[right_context+frame_subsampling_factor/2]
+
+egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts"
+
+valid_left_context=$[valid_left_context+frame_subsampling_factor/2]
+valid_right_context=$[valid_right_context+frame_subsampling_factor/2]
+
+# don't do the overlap thing for the validation data.
+valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts"
+
+priors_left_context=$[priors_left_context+frame_subsampling_factor/2]
+priors_right_context=$[priors_right_context+frame_subsampling_factor/2]
+
+# don't do the overlap thing for the priors computation data.
+priors_egs_opts="--left-context=$priors_left_context --right-context=$priors_right_context --num-frames=1 --compress=$compress"
+
+supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor"
+
+echo $left_context > $dir/info/left_context
+echo $right_context > $dir/info/right_context
+
+echo $priors_left_context > $dir/info/priors_left_context
+echo $priors_right_context > $dir/info/priors_right_context
+
+echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor
+
+(
+
+if $adjust_priors && [ $stage -le 10 ]; then
+  
+if [ ! -f $dir/ali.scp ]; then
+  nj_ali=$(cat $alidir/num_jobs)
+  all_ids=$(seq -s, $nj_ali)
+  $cmd $dir/log/copy_alignments.log \
+    copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \
+    ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1;
+fi
+
+priors_egs_list=
+for y in `seq $num_archives_priors`; do
+  utils/create_data_link.pl $dir/priors_egs.$y.ark
+  priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark"
+done
+
+echo "$0: dumping egs for prior adjustment in the background."
+
+num_pdfs=`am-info $alidir/final.mdl | grep pdfs | awk '{print $NF}' 2>/dev/null` || exit 1
+
+$cmd $dir/log/create_priors_subset.log \
+  nnet3-get-egs --num-pdfs=$num_pdfs $priors_ivector_opt $priors_egs_opts "$priors_feats" \
+  "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \
+  ark:- \| nnet3-copy-egs ark:- $priors_egs_list || \
+  { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; }
+
+sleep 3;
+
+echo $num_archives_priors >$dir/info/num_archives_priors
+
+else
+
+echo 0 > $dir/info/num_archives_priors
+
+fi
+
+) &
+
+if [ $stage -le 4 ]; then
+  echo "$0: Getting validation and training subset examples."
+  rm -f $dir/.error 2>/dev/null || true
+  echo "$0: ... extracting validation and training-subset alignments."
+
+  #utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \
+  #  <$dir/lat.scp >$dir/lat_special.scp
+
+  utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \
+    <$dir/ali.scp >$dir/ali_special.scp
+
+  $cmd $dir/log/create_valid_subset.log \
+    discriminative-get-supervision $supervision_all_opts \
+    scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \
+    nnet3-discriminative-get-egs $valid_ivector_opt $valid_egs_opts \
+    $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error &
+
+  $cmd $dir/log/create_train_subset.log \
+    discriminative-get-supervision $supervision_all_opts \
+    scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \
+    nnet3-discriminative-get-egs $train_subset_ivector_opt $egs_opts \
+    $dir/final.mdl "$train_subset_feats" ark,s,cs:- "ark:$dir/train_diagnostic.degs" || touch $dir/.error &
+  wait;
+  [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1
+  echo "... Getting subsets of validation examples for diagnostics and combination."
+  
+  for f in $dir/{train_diagnostic,valid_diagnostic}.degs; do
+    [ ! -s $f ] && echo "No examples in file $f" && exit 1;
+  done
+fi
+
+if [ $stage -le 5 ]; then
+  # create degs_orig.*.*.ark; the first index goes to $nj,
+  # the second to $num_archives_intermediate.
+
+  degs_list=
+  for n in $(seq $num_archives_intermediate); do
+    degs_list="$degs_list ark:$dir/degs_orig.JOB.$n.ark"
+  done
+  echo "$0: Generating training examples on disk"
+  
+  # The examples will go round-robin to degs_list.  
+  # To make it efficient we need to use a large 'nj', like 40, and in that case
+  # there can be too many small files to deal with, because the total number of
+  # files is the product of 'nj' by 'num_archives_intermediate', which might be
+  # quite large.
+  $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \
+    discriminative-get-supervision $supervision_all_opts \
+    "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \
+    "ark:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \
+    nnet3-discriminative-get-egs $ivector_opt $egs_opts \
+    $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \
+    nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  echo "$0: recombining and shuffling order of archives on disk"
+  # combine all the "degs_orig.*.JOB.ark" (over the $nj splits of the data) and
+  # shuffle the order, writing to the degs.JOB.ark
+
+  # the input is a concatenation over the input jobs.
+  degs_list=
+  for n in $(seq $nj); do
+    degs_list="$degs_list $dir/degs_orig.$n.JOB.ark"
+  done
+  
+  if [ $archives_multiple == 1 ]; then # normal case.
+    $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
+      nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:$dir/degs.JOB.ark  || exit 1;
+  else
+    # we need to shuffle the 'intermediate archives' and then split into the
+    # final archives.  we create soft links to manage this splitting, because
+    # otherwise managing the output names is quite difficult (and we don't want
+    # to submit separate queue jobs for each intermediate archive, because then
+    # the --max-jobs-run option is hard to enforce).
+    output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/degs.JOB.$y.ark; done)"
+    for x in $(seq $num_archives_intermediate); do
+      for y in $(seq $archives_multiple); do
+        archive_index=$[($x-1)*$archives_multiple+$y]
+        # degs.intermediate_archive.{1,2,...}.ark will point to degs.archive.ark
+        ln -sf degs.$archive_index.ark $dir/degs.$x.$y.ark || exit 1
+      done
+    done
+    $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
+      nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:- \| \
+      nnet3-discriminative-copy-egs ark:- $output_archives || exit 1;
+  fi
+fi
+  
+wait  # let the background prior-egs job finish before its inputs (ali.*, trans.*) are deleted below
+
+if [ $stage -le 7 ]; then
+  echo "$0: removing temporary archives"
+  (
+    cd $dir
+    for f in $(ls -l . | grep 'degs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->")  print $Y, $NF; }'); do rm $f; done
+  )
+  if [ $archives_multiple -gt 1 ]; then
+    # there are some extra soft links that we should delete.
+    for f in $dir/degs.*.*.ark; do rm $f; done
+  fi
+  echo "$0: removing temporary lattices"
+  rm -f $dir/lat.* $dir/lat_special.*  # the lattice-copy stage writes lat_special.*; -f because lat.* may not exist
+  echo "$0: removing temporary alignments and transforms"
+  # Ignore errors below because trans.* might not exist.
+  rm -f $dir/{ali,trans}.{ark,scp} 2>/dev/null || true
+fi
+
+echo "$0: Finished preparing training examples"
diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh
new file mode 100755
index 00000000000..3d0d1e5e418
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+# Copyright 2012        Johns Hopkins University (Author: Daniel Povey)
+#           2014-2015   Vimal Manohar
+# Apache 2.0.
+
+# Create denominator lattices for MMI/MPE training.  
+# This version uses the neural-net models (version 3, i.e. the nnet3 code).
+# Creates its output in $dir/lat.*.gz
+
+# Begin configuration section.
+nj=4  # number of pieces the data is split into (one decoding job each); written to $dir/num_jobs
+cmd=run.pl  # job dispatcher used to launch the decoding jobs
+sub_split=1  # if not 1, every data split is further split (per utterance) into this many pieces
+beam=13.0  # --beam for nnet3-latgen-faster (also used by the optional determinization step)
+frames_per_chunk=50  # passed to the decoder as --frames-per-chunk
+lattice_beam=7.0  # passed to the decoder as --lattice-beam
+self_loop_scale=0.1  # passed to utils/mkgraph.sh as --self-loop-scale
+acwt=0.1  # acoustic scale for decoding (and for the optional determinization step)
+max_active=5000  # passed to the decoder as --max-active
+min_active=200  # passed to the decoder as --min-active
+transform_dir=  # optional: directory with num_jobs and trans.N (raw_trans.N for raw features)
+max_mem=20000000 # This will stop the processes getting too large.
+# This is in bytes, but not "real" bytes-- you have to multiply
+# by something like 5 or 10 to get real bytes (not sure why so large)
+num_threads=1 # Fixed to 1 for now
+online_ivector_dir=  # optional: directory with ivector_online.scp and ivector_period
+determinize=false  # if true, lattices are piped through lattice-determinize-non-compact before being written
+minimize=false  # --minimize value for that determinization step (the decoder itself always gets --minimize=false)
+ivector_scale=1.0  # NOTE(review): not referenced anywhere else in this script
+parallel_opts= # ignored now
+extra_left_context=0  # this and the three context options below are passed straight through to the decoder
+extra_right_context=0
+extra_left_context_initial=-1
+extra_right_context_final=-1
+feat_type=  # you can set this in order to run on top of delta features, although we don't
+            # normally want to do this.  If empty: lda when $srcdir/final.mat exists, else raw.
+# End configuration section.
+
+
+echo "$0 $@"  # Print the command line for logging
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;  # presumably maps --foo-bar <value> options onto the configuration variables above; confirm
+
+num_threads=1 # Fixed to 1 for now; assigned after option parsing, so any --num-threads value is overridden
+
+if [ $# != 4 ]; then  # exactly four positional arguments are required; otherwise print the usage and fail
+  echo "Usage: steps/nnet3/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
+  echo "  e.g.: steps/nnet3/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats"
+  echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
+  echo " plus transforms."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --sub-split <n-split>                            # e.g. 40; use this for "
+  echo "                           # large databases so your jobs will be smaller and"
+  echo "                           # will (individually) finish reasonably soon."
+  echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
+  echo "  --num-threads  <n>                # number of threads per decoding job"  # NOTE(review): num_threads is forced to 1 above, so this option currently has no effect
+  exit 1;
+fi
+
+data=$1  # data directory; must contain feats.scp (text is read when building the graph)
+lang=$2  # lang directory; must contain L.fst (words.txt and oov.int are also read)
+srcdir=$3  # model directory; must contain final.mdl
+dir=$4  # output directory (logs, graph and lat.*.gz are written here)
+
+
+extra_files=  # additional required inputs; stays empty unless online iVectors are used
+[ ! -z "$online_ivector_dir" ] && \
+  extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period"
+for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $extra_files; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1;
+done
+
+sdata=$data/split$nj  # directory of the split data; job N reads from $sdata/N
+splice_opts=`cat $srcdir/splice_opts 2>/dev/null`  # empty if the file does not exist; used for lda features
+thread_string=  # suffix appended to the decoder binary name
+[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"  # not taken while num_threads is forced to 1
+
+mkdir -p $dir/log
+split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+oov=`cat $lang/oov.int` || exit 1;  # integer id that out-of-vocabulary words are mapped to (see --map-oov below)
+
+cp -rH $lang $dir/  # work on a copy of the lang directory, since a new G.fst is written into it below
+
+# $new_lang is the copy of $lang inside $dir; a unigram G.fst (built from $data/text) is written into it below.
+new_lang="$dir/"$(basename "$lang")
+
+# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
+# it gets L_disambig.fst and G.fst (among other things) from $new_lang, and
+# final.mdl from $srcdir; the output HCLG.fst goes in $dir/dengraph.
+# Graph creation is skipped when a non-empty HCLG.fst newer than final.mdl already exists.
+echo "Compiling decoding graph in $dir/dengraph"
+if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then
+  echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
+else
+  echo "Making unigram grammar FST in $new_lang"  # the awk step below drops the first field of each line (presumably the utterance id)
+  cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \
+   awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
+    utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \
+    || exit 1;
+  utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1;
+fi
+cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null`  # empty if the file does not exist
+cp $srcdir/cmvn_opts $dir 2>/dev/null  # best-effort copy; the error from a missing file is silenced
+
+if [ -z "$feat_type" ]; then  # no --feat-type given: choose it from the presence of final.mat
+  if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi
+fi
+echo "$0: feature type is $feat_type"         
+# Build the feature pipeline.  "JOB" is a literal placeholder here (presumably replaced with the job index by $cmd).
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+   ;;
+  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+    cp $srcdir/final.mat $dir    # keep a copy of the LDA matrix in the output directory
+   ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+if [ ! -z "$transform_dir" ]; then  # optionally append a transform-feats stage to the feature pipeline
+  echo "$0: using transforms from $transform_dir"
+  [ ! -s $transform_dir/num_jobs ] && \
+    echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1;
+  nj_orig=$(cat $transform_dir/num_jobs)  # the transforms are stored as $trans.1 .. $trans.$nj_orig
+  # raw features use raw_trans.*; all other feature types use trans.*
+  if [ $feat_type == "raw" ]; then trans=raw_trans;
+  else trans=trans; fi
+  if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then
+    echo "$0: LDA transforms differ between $srcdir and $transform_dir"
+    exit 1;
+  fi
+  if [ ! -f $transform_dir/$trans.1 ]; then
+    echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)"
+    exit 1;
+  fi
+  if [ $nj -ne $nj_orig ]; then
+    # Job counts differ, so copy all the transforms into one archive with an index that every job can read.
+    for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \
+       copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1;
+    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |"
+  else
+    # number of jobs matches that of $transform_dir: job JOB reads $trans.JOB directly.
+    feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |"
+  fi
+fi
+
+
+# if this job is interrupted by the user, we want any background jobs to be
+# killed too.  (Background jobs are only started in the sub-split branch below.)
+cleanup() {
+  local pids=$(jobs -pr)  # process IDs of this shell's running background jobs
+  [ -n "$pids" ] && kill $pids  # nothing is killed when no background job is running
+}
+trap "cleanup" INT QUIT TERM EXIT  # EXIT is included, so cleanup also runs when the script ends normally
+
+if [ ! -z "$online_ivector_dir" ]; then  # otherwise ivector_opts is never set here and expands to nothing
+  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
+  ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period"
+fi
+
+if [ -f $srcdir/frame_subsampling_factor ]; then
+  # e.g. for 'chain' systems
+  frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)"
+  cp $srcdir/frame_subsampling_factor $dir
+fi
+
+lattice_determinize_cmd=  # optional pipeline stage placed in front of 'gzip -c' when the lattices are written
+if $determinize; then  # $determinize is executed as a command, so it has to be "true" or "false"
+  lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$beam ark:- ark:- |"
+fi  # NOTE(review): the pruning above uses the decoding beam ($beam), not $lattice_beam -- confirm this is intended
+
+if [ $sub_split -eq 1 ]; then  # no sub-splitting: one decoding job per data split; decoder options must match the sub-split branch below
+  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \
+    nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
+    --frames-per-chunk=$frames_per_chunk \
+    --extra-left-context=$extra_left_context \
+    --extra-right-context=$extra_right_context \
+    --extra-left-context-initial=$extra_left_context_initial \
+    --extra-right-context-final=$extra_right_context_final \
+    --minimize=false --determinize-lattice=false \
+    --word-determinize=false --phone-determinize=false \
+    --max-active=$max_active --min-active=$min_active --beam=$beam \
+    --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \
+    --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
+    $dir/dengraph/HCLG.fst "$feats" \
+    "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.JOB.gz" || exit 1
+else
+  # Sub-split mode: decode subset n in the background while the lattices of subset n-1 are merged.
+  # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim
+  # to have at most two jobs running at each time.  The idea is that if we have stragglers 
+  # from one job, we can be processing another one at the same time.
+  rm $dir/.error 2>/dev/null
+  # prev_pid / prev_n describe the subset whose background decode was started in the previous iteration.
+  prev_pid=
+  for n in `seq $[nj+1]`; do  # one extra iteration (n = nj+1) so that the last subset is waited for and merged
+    if [ $n -gt $nj ]; then
+      this_pid=
+    elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then
+      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
+      this_pid=
+    else
+      sdata2=$data/split$nj/$n/split$sub_split;
+      if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
+        split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
+      fi
+      mkdir -p $dir/log/$n
+      mkdir -p $dir/part
+      feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g`
+      # Decode this subset in the background; a failure leaves the marker $dir/.error, checked after 'wait'.
+      $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
+        nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
+        --frames-per-chunk=$frames_per_chunk \
+        --extra-left-context=$extra_left_context \
+        --extra-right-context=$extra_right_context \
+        --extra-left-context-initial=$extra_left_context_initial \
+        --extra-right-context-final=$extra_right_context_final \
+        --minimize=false --determinize-lattice=false \
+        --word-determinize=false --phone-determinize=false \
+        --max-active=$max_active --min-active=$min_active --beam=$beam \
+        --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \
+        --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
+        $dir/dengraph/HCLG.fst "$feats_subset" \
+        "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error &
+      this_pid=$!
+    fi
+    if [ ! -z "$prev_pid" ]; then  # Wait for the previous job; merge the previous set of lattices.
+      wait $prev_pid
+      [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1;
+      rm $dir/.merge_error 2>/dev/null
+      echo Merging archives for data subset $prev_n
+      for k in `seq $sub_split`; do
+        gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error;
+      done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error;
+      [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1;
+      rm $dir/lat.$prev_n.*.gz
+      touch $dir/.done.$prev_n
+    fi
+    prev_n=$n
+    prev_pid=$this_pid
+  done
+fi
+
+
+echo "$0: done generating denominator lattices."
+
diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py
new file mode 100755
index 00000000000..a00e4cfd0e7
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py
@@ -0,0 +1,490 @@
+#!/usr/bin/env python
+
+# tdnn or RNN with 'jesus layer'
+
+
+# notes on jesus layer with recurrence:
+
+#  inputs to jesus layer:
+#      - for each previous layer in regular splicing, the output of dim  --jesus-forward-output-dim
+#      - for each recurrent connection:
+#      - direct input from the recurrence                            --jesus-direct-recurrence-dim
+#      - indirect [projected] input from recurrence.                 --jesus-projected-recurrence-input-dim
+#  outputs of jesus layer:
+#     for all layers:
+#       --jesus-forward-output-dim
+#     for recurrent layers:
+#       --jesus-direct-recurrence-dim
+#       --jesus-projected-recurrence-output-dim
+
+
+# we're using python 3.x style print but want it to work in python 2.x,
+from __future__ import print_function
+import re, os, argparse, sys, math, warnings
+
+
+parser = argparse.ArgumentParser(description="Writes config files and variables "
+                                 "for TDNNs creation and training",
+                                 epilog="See steps/nnet3/train_tdnn.sh for example.");
+parser.add_argument("--splice-indexes", type=str,
+                    help="Splice[:recurrence] indexes at each hidden layer, e.g. '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'. "
+                    "Note: recurrence indexes are optional, may not appear in 1st layer, and must be "
+                    "either all negative or all positive for any given layer.")
+parser.add_argument("--feat-dim", type=int,
+                    help="Raw feature dimension, e.g. 13")
+parser.add_argument("--ivector-dim", type=int,
+                    help="iVector dimension, e.g. 100", default=0)
+parser.add_argument("--include-log-softmax", type=str,
+                    help="add the final softmax layer ", default="true", choices = ["false", "true"])
+parser.add_argument("--xent-regularize", type=float,
+                    help="For chain models, if nonzero, add a separate output for cross-entropy "
+                    "regularization (with learning-rate-factor equal to the inverse of this)",
+                    default=0.0)
+parser.add_argument("--use-repeated-affine", type=str,
+                    help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)",
+                    default="true", choices = ["false", "true"])
+parser.add_argument("--final-layer-learning-rate-factor", type=float,
+                    help="Learning-rate factor for final affine component",
+                    default=1.0)
+parser.add_argument("--recurrent-projection-learning-rate-factor", type=float,
+                    help="Learning-rate factor for recurrent projections",
+                    default=10.0)
+parser.add_argument("--jesus-hidden-dim", type=int,
+                    help="hidden dimension of Jesus layer.", default=10000)
+parser.add_argument("--jesus-forward-output-dim", type=int,
+                    help="part of output dimension of Jesus layer that goes to next layer",
+                    default=1000)
+parser.add_argument("--jesus-forward-input-dim", type=int,
+                    help="Input dimension of Jesus layer that comes from affine projection "
+                    "from the previous layer (same as output dim of forward affine transform)",
+                    default=1000)
+parser.add_argument("--final-hidden-dim", type=int,
+                    help="Final hidden layer dimension-- or if <0, the same as "
+                    "--jesus-forward-input-dim", default=-1)
+parser.add_argument("--jesus-direct-recurrence-dim", type=int,
+                    help="part of output dimension of Jesus layer that comes directly from "
+                    "different time instance of the same Jesus layer", default=1000)
+parser.add_argument("--jesus-projected-recurrence-output-dim", type=int,
+                    help="part of output dimension of Jesus layer (in recurrent layers) "
+                    "that is destined for projection to dimension "
+                    "--jesus-projected-recurrence-input-dim", default=500)
+parser.add_argument("--jesus-projected-recurrence-input-dim", type=int,
+                    help="part of input dimension of Jesus layer that comes via "
+                    "projection from the output of the same Jesus layer at different time",
+                    default=200)
+parser.add_argument("--num-jesus-blocks", type=int,
+                    help="number of blocks in Jesus layer.  All configs of the form "
+                    "--jesus-*-dim will be rounded up to be a multiple of this.",
+                    default=100);
+parser.add_argument("--jesus-stddev-scale", type=float,
+                    help="Scaling factor on parameter stddev of Jesus layer (smaller->jesus layer learns faster)",
+                    default=1.0)
+parser.add_argument("--clipping-threshold", type=float,
+                    help="clipping threshold used in ClipGradient components (only relevant if "
+                    "recurrence indexes are specified).  If clipping-threshold=0 no clipping is done",
+                    default=15)
+parser.add_argument("--num-targets", type=int,
+                    help="number of network targets (e.g. num-pdf-ids/num-leaves)")
+parser.add_argument("config_dir",
+                    help="Directory to write config files and variables");
+
+print(' '.join(sys.argv))  # echo the command line to stdout for logging
+
+args = parser.parse_args()
+# Create the output directory if needed; the config files below are written into it.
+if not os.path.exists(args.config_dir):
+    os.makedirs(args.config_dir)
+
+## Check arguments.  (sys.exit() with a string prints it to stderr and exits with status 1.)
+if args.splice_indexes is None:
+    sys.exit("--splice-indexes argument is required");
+if args.feat_dim is None or not (args.feat_dim > 0):  # also rejects zero and negative values
+    sys.exit("--feat-dim argument is required");
+if args.num_targets is None or not (args.num_targets > 0):  # also rejects zero and negative values
+    sys.exit("--num-targets argument is required");
+if args.num_jesus_blocks < 1:
+    sys.exit("invalid --num-jesus-blocks value");
+if args.final_hidden_dim < 0:  # negative (the default is -1) means "same as --jesus-forward-input-dim"
+    args.final_hidden_dim = args.jesus_forward_input_dim
+# Round each of the dimensions below up to the next multiple of --num-jesus-blocks, storing it back into args.
+for name in [ "jesus_hidden_dim", "jesus_forward_output_dim", "jesus_forward_input_dim",
+              "jesus_direct_recurrence_dim", "jesus_projected_recurrence_output_dim",
+              "jesus_projected_recurrence_input_dim", "final_hidden_dim" ]:
+    old_val = getattr(args, name)
+    if old_val % args.num_jesus_blocks != 0:
+        new_val = old_val + args.num_jesus_blocks - (old_val % args.num_jesus_blocks)
+        printable_name = '--' + name.replace('_', '-')  # attribute name back to its command-line spelling
+        print('Rounding up {0} from {1} to {2} to be a multiple of --num-jesus-blocks={3}: '.format(
+                printable_name, old_val, new_val, args.num_jesus_blocks))
+        setattr(args, name, new_val);
+
+
+## Work out splice_array and recurrence_array,
+## e.g. for
+## args.splice_indexes == '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'
+## we would have
+##   splice_array = [ [ -3,-2,...3 ], [-3,0], [-3,0], [-6,-3,0] ]
+## and
+##  recurrence_array = [ [], [-3], [-3], [-6,-3] ]
+## Note, recurrence_array[0] must be empty; and any element of recurrence_array
+## may be empty.  Also it cannot contain zeros, or both positive and negative elements
+## at the same layer.
+splice_array = []
+recurrence_array = []
+left_context = 0
+right_context = 0
+split_on_spaces = args.splice_indexes.split(" ")  # one entry per layer; we already checked the option was given.
+if len(split_on_spaces) < 2:
+    sys.exit("invalid --splice-indexes argument, too short: "
+             + args.splice_indexes)
+try:
+    for string in split_on_spaces:
+        this_layer = len(splice_array)
+        split_on_colon = string.split(":")  # there will only be a colon if
+                                            # there is recurrence at this layer.
+        if len(split_on_colon) < 1 or len(split_on_colon) > 2 or (this_layer == 0 and len(split_on_colon) > 1):
+            sys.exit("invalid --splice-indexes argument: " + args.splice_indexes)
+        if len(split_on_colon) == 1:
+            split_on_colon.append("")
+        # int() raises ValueError on a malformed or empty splice field; the except clause below reports it.
+        this_splices = [ int(x) for x in split_on_colon[0].split(",") ]
+        this_recurrence = [ int(x) for x in split_on_colon[1].split(",") if x ]
+        splice_array.append(this_splices)
+        recurrence_array.append(this_recurrence)
+        if (len(this_splices) < 1):
+            sys.exit("invalid --splice-indexes argument [empty splices]: " + args.splice_indexes)
+        # recurrence offsets must be nonzero and all of one sign; min*max checks this for any length and order.
+        if len(this_recurrence) > 0 and min(this_recurrence) * max(this_recurrence) <= 0:
+            sys.exit("invalid --splice-indexes argument [invalid recurrence indexes; would not be computable]: " + args.splice_indexes)
+        if not this_splices == sorted(this_splices):
+            sys.exit("elements of --splice-indexes must be sorted: "
+                     + args.splice_indexes)
+        left_context += -this_splices[0]
+        right_context += this_splices[-1]
+except ValueError as e:
+    sys.exit("invalid --splice-indexes argument " + args.splice_indexes + " " + str(e))
+left_context = max(0, left_context)  # the summed contexts are clamped so they are never negative
+right_context = max(0, right_context)
+num_hidden_layers = len(splice_array)
+input_dim = len(splice_array[0]) * args.feat_dim  +  args.ivector_dim  # dimension of the spliced first-layer input
+
+f = open(args.config_dir + "/vars", "w")  # name=value lines (presumably read back by the calling training script)
+print('left_context=' + str(left_context), file=f)
+print('right_context=' + str(right_context), file=f)
+print('num_hidden_layers=' + str(num_hidden_layers), file=f)
+f.close()
+
+# init.config: a network whose output is just the first layer's spliced input (plus the iVector, if any).
+f = open(args.config_dir + "/init.config", "w")
+print('# Config file for initializing neural network prior to', file=f)
+print('# preconditioning matrix computation', file=f)
+print('input-node name=input dim=' + str(args.feat_dim), file=f)
+list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ]  # NOTE(review): 'list' shadows the Python builtin from here on
+if args.ivector_dim > 0:
+    print('input-node name=ivector dim=' + str(args.ivector_dim), file=f)
+    list.append('ReplaceIndex(ivector, t, 0)')
+# example of next line (note that no quotes are written around the Append(...) expression):
+# output-node name=output input=Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))
+print('output-node name=output input=Append({0})'.format(", ".join(list)), file=f)
+f.close()
+
+
+for l in range(1, num_hidden_layers + 1):
+    # the following summarizes the structure of the layers:  Here, the Jesus component includes ReLU at its input and output, and renormalize
+    #   at its output after the ReLU.
+    # layer1: splice + LDA-transform + affine + ReLU + renormalize
+    # layerX [non-recurrent]: splice + Jesus + affine + ReLU
+    # layerX [recurrent]: splice + Jesus + renormalize + split up:  -> [forward] affine + ReLU
+    #                                                               -> [direct-recurrent]
+    #                                                               -> [projected-recurrent, one per delay]: affine + ReLU
+    # Inside the jesus component is:
+    #  [permute +] ReLU + repeated-affine + ReLU + repeated-affine
+    # [we make the repeated-affine the last one so we don't have to redo that in backprop].
+    # We follow this with a post-jesus composite component containing the operations:
+    #  [permute +] ReLU + renormalize
+    # call this post-jesusN.
+    # After this we use dim-range nodes to split up the output into
+    # [ jesusN-forward-output, jesusN-direct-output and jesusN-projected-output ]
+    # parts;
+    # and nodes for the jesusN-forward-affine and jesusN-recurrent-affine-offsetN
+    # and jesusN-recurrent-affine-offsetN-clip
+    # computations.
+
+    f = open(args.config_dir + "/layer{0}.config".format(l), "w")
+    print('# Config file for layer {0} of the network'.format(l), file=f)
+    if l == 1:
+        print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'.
+              format(args.config_dir), file=f)
+        splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ]
+        if args.ivector_dim > 0: splices.append('ReplaceIndex(ivector, t, 0)')
+        orig_input='Append({0})'.format(', '.join(splices))
+        # e.g. orig_input = 'Append(Offset(input, -2), ... Offset(input, 2), ivector)'
+        print('component-node name=lda component=lda input={0}'.format(orig_input),
+              file=f)
+        # after the initial LDA transform, put a trainable affine layer and a ReLU, followed
+        # by a NormalizeComponent.
+        print('component name=affine1 type=NaturalGradientAffineComponent '
+              'input-dim={0} output-dim={1} bias-stddev=0'.format(
+                input_dim, args.jesus_forward_input_dim), file=f)
+        print('component-node name=affine1 component=affine1 input=lda',
+              file=f)
+        # the ReLU after the affine
+        print('component name=relu1 type=RectifiedLinearComponent dim={1}'.format(
+                l, args.jesus_forward_input_dim), file=f)
+        print('component-node name=relu1 component=relu1 input=affine1', file=f)
+        # the renormalize component after the ReLU
+        print ('component name=renorm1 type=NormalizeComponent dim={0} '.format(
+                args.jesus_forward_input_dim), file=f)
+        print('component-node name=renorm1 component=renorm1 input=relu1', file=f)
+        cur_output = 'renorm1'
+        cur_affine_output_dim = args.jesus_forward_input_dim
+    else:
+        splices = []
+        spliced_dims = []
+        for offset in splice_array[l-1]:
+            # the connection from the previous layer
+            if l == 2:
+                splices.append('Offset(renorm1, {0})'.format(offset))
+            else:
+                splices.append('Offset(jesus{0}-forward-output-affine, {1})'.format(l-1, offset))
+            spliced_dims.append(args.jesus_forward_input_dim)
+        for offset in recurrence_array[l-1]:
+            # the direct recurrence
+            splices.append('IfDefined(Offset(jesus{0}-direct-output, {1}))'.format(l, offset))
+            spliced_dims.append(args.jesus_direct_recurrence_dim)
+            # the indirect recurrence (via projection)
+            splices.append('IfDefined(Offset(jesus{0}-recurrent-affine-offset{1}-clip, {1}))'.format(l, offset))
+            spliced_dims.append(args.jesus_projected_recurrence_input_dim)
+
+        # get the input to the Jesus layer.
+        cur_input = 'Append({0})'.format(', '.join(splices))
+        cur_dim = sum(spliced_dims)
+
+        this_layer_is_recurrent = (len(recurrence_array[l-1]) != 0)
+        this_jesus_output_dim = args.jesus_forward_output_dim + (
+            (args.jesus_projected_recurrence_output_dim +
+             args.jesus_direct_recurrence_dim) if this_layer_is_recurrent else 0)
+
+        # As input to the Jesus component we'll append the spliced input and
+        # recurrent input, and the first thing inside the component that we do
+        # is rearrange the dimensions so that things pertaining to a particular
+        # block stay together.
+
+        column_map = []
+        for x in range(0, args.num_jesus_blocks):
+            dim_offset = 0
+            for src_splice in spliced_dims:
+                src_block_size = src_splice / args.num_jesus_blocks
+                for y in range(0, src_block_size):
+                    column_map.append(dim_offset + (x * src_block_size) + y)
+                dim_offset += src_splice
+        if sorted(column_map) != range(0, sum(spliced_dims)):
+            print("column_map is " + str(column_map))
+            print("num_jesus_blocks is " + str(args.num_jesus_blocks))
+            print("spliced_dims is " + str(spliced_dims))
+            sys.exit("code error creating new column order")
+
+        need_input_permute_component = (column_map != range(0, sum(spliced_dims)))
+
+        # Now add the jesus component.
+        num_sub_components = (5 if need_input_permute_component else 4);
+        print('component name=jesus{0} type=CompositeComponent num-components={1}'.format(
+                l, num_sub_components), file=f, end='')
+        # print the sub-components of the CompositeComponent on the same line.
+        # this CompositeComponent has the same effect as a sequence of
+        # components, but saves memory.
+        if need_input_permute_component:
+            print(" component1='type=PermuteComponent column-map={1}'".format(
+                    l, ','.join([str(x) for x in column_map])), file=f, end='')
+        print(" component{0}='type=RectifiedLinearComponent dim={1}'".format(
+                (2 if need_input_permute_component else 1),
+                cur_dim), file=f, end='')
+
+        if args.use_repeated_affine == "true":
+            print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} "
+                  "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format(
+                    (3 if need_input_permute_component else 2),
+                    cur_dim, args.jesus_hidden_dim,
+                    args.num_jesus_blocks,
+                    args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks),
+                    0.5 * args.jesus_stddev_scale),
+                  file=f, end='')
+        else:
+            print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} "
+                  "num-blocks={3} param-stddev={4} bias-stddev=0'".format(
+                    (3 if need_input_permute_component else 2),
+                    cur_dim, args.jesus_hidden_dim,
+                    args.num_jesus_blocks,
+                    args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks)),
+                  file=f, end='')
+
+
+        print(" component{0}='type=RectifiedLinearComponent dim={1}'".format(
+                (4 if need_input_permute_component else 3),
+                args.jesus_hidden_dim), file=f, end='')
+
+
+
+        if args.use_repeated_affine == "true":
+            print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} "
+                  "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format(
+                    (5 if need_input_permute_component else 4),
+                    args.jesus_hidden_dim,
+                    this_jesus_output_dim,
+                    args.num_jesus_blocks,
+                    args.jesus_stddev_scale / math.sqrt(args.jesus_hidden_dim / args.num_jesus_blocks),
+                    0.5 * args.jesus_stddev_scale),
+                  file=f, end='')
+        else:
+            print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} "
+                  "num-blocks={3} param-stddev={4} bias-stddev=0'".format(
+                    (5 if need_input_permute_component else 4),
+                    args.jesus_hidden_dim,
+                    this_jesus_output_dim,
+                    args.num_jesus_blocks,
+                    args.jesus_stddev_scale / math.sqrt((args.jesus_hidden_dim / args.num_jesus_blocks))),
+                  file=f, end='')
+
+        print("", file=f) # print newline.
+        print('component-node name=jesus{0} component=jesus{0} input={1}'.format(
+                l, cur_input), file=f)
+
+        # now print the post-Jesus component which consists of [permute +] ReLU
+        # + renormalize.  we only need the permute component if this is a
+        # recurrent layer.
+
+        num_sub_components = (3 if this_layer_is_recurrent else 2);
+        print('component name=post-jesus{0} type=CompositeComponent num-components={1}'.format(
+                l, num_sub_components), file=f, end='')
+        if this_layer_is_recurrent:
+            column_map = []
+            output_part_dims = [ args.jesus_forward_output_dim,
+                                 args.jesus_direct_recurrence_dim,
+                                 args.jesus_projected_recurrence_output_dim ]
+            if sum(output_part_dims) != this_jesus_output_dim:
+                sys.exit("code error")
+            total_block_size = this_jesus_output_dim / args.num_jesus_blocks
+            previous_part_dims_sum = 0
+            for part_dim in output_part_dims:
+                within_block_offset = previous_part_dims_sum / args.num_jesus_blocks
+                within_block_dim = part_dim / args.num_jesus_blocks
+                for x in range(0, args.num_jesus_blocks):
+                    for y in range(0, within_block_dim):
+                        column_map.append(x * total_block_size + within_block_offset + y)
+                previous_part_dims_sum += part_dim
+            if sorted(column_map) != range(0, this_jesus_output_dim):
+                print("column_map is " + str(column_map))
+                print("output_part_dims is " + str(output_part_dims))
+                sys.exit("code error creating new column order")
+            print(" component1='type=PermuteComponent column-map={1}'".format(
+                    l, ','.join([str(x) for x in column_map ])), file=f, end='')
+
+        # still within the post-Jesus component, print the ReLU
+        print(" component{0}='type=RectifiedLinearComponent dim={1}'".format(
+                (2 if this_layer_is_recurrent else 1),
+                this_jesus_output_dim), file=f, end='')
+        # still within the post-Jesus component, print the NormalizeComponent
+        print(" component{0}='type=NormalizeComponent dim={1} '".format(
+                (3 if this_layer_is_recurrent else 2),
+                this_jesus_output_dim), file=f, end='')
+        print("", file=f) # print newline.
+        print('component-node name=post-jesus{0} component=post-jesus{0} input=jesus{0}'.format(l),
+              file=f)
+
+        if len(recurrence_array[l-1]) != 0:
+            # This is a recurrent layer -> print the dim-range nodes.
+            dim_offset = 0
+            print('dim-range-node name=jesus{0}-forward-output input-node=post-jesus{0} '
+                  'dim={1} dim-offset={2}'.format(l, args.jesus_forward_output_dim, dim_offset), file=f)
+            dim_offset += args.jesus_forward_output_dim
+            print('dim-range-node name=jesus{0}-direct-output input-node=post-jesus{0} '
+                  'dim={1} dim-offset={2}'.format(l, args.jesus_direct_recurrence_dim, dim_offset), file=f)
+            dim_offset += args.jesus_direct_recurrence_dim
+            print('dim-range-node name=jesus{0}-projected-output input-node=post-jesus{0} '
+                  'dim={1} dim-offset={2}'.format(l, args.jesus_projected_recurrence_output_dim,
+                                                  dim_offset), file=f)
+            input_to_forward_affine = 'jesus{0}-forward-output'.format(l)
+        else:
+            input_to_forward_affine = 'post-jesus{0}'.format(l)
+
+        # handle the forward output, we need an affine node for this:
+        cur_affine_output_dim = (args.jesus_forward_input_dim if l < num_hidden_layers else args.final_hidden_dim)
+        print('component name=forward-affine{0} type=NaturalGradientAffineComponent '
+              'input-dim={1} output-dim={2} bias-stddev=0'.
+              format(l, args.jesus_forward_output_dim, cur_affine_output_dim), file=f)
+        print('component-node name=jesus{0}-forward-output-affine component=forward-affine{0} input={1}'.format(
+                l, input_to_forward_affine), file=f)
+        # for each recurrence delay, create an affine node followed by a
+        # clip-gradient node.  [if there are multiple recurrences in the same layer,
+        # each one gets its own affine projection.]
+
+        # The reason we set the param-stddev to 0 is out of concern that if we
+        # initialize to nonzero, this will encourage the corresponding inputs at
+        # the jesus layer to become small (to remove this random input), which
+        # in turn will make this component learn slowly (due to small
+        # derivatives).  we set the bias-mean to 0.001 so that the ReLUs on the
+        # input of the Jesus layer are in the part of the activation that has a
+        # nonzero derivative- otherwise with this setup it would never learn.
+        for delay in recurrence_array[l-1]:
+            print('component name=jesus{0}-recurrent-affine-offset{1} type=NaturalGradientAffineComponent '
+                  'input-dim={2} output-dim={3} learning-rate-factor={4} param-stddev=0 bias-stddev=0 bias-mean=0.001'.
+                  format(l, delay,
+                         args.jesus_projected_recurrence_output_dim,
+                         args.jesus_projected_recurrence_input_dim,
+                         args.recurrent_projection_learning_rate_factor), file=f)
+            print('component-node name=jesus{0}-recurrent-affine-offset{1} component=jesus{0}-recurrent-affine-offset{1} '
+                  'input=jesus{0}-projected-output'.format(l, delay), file=f)
+            print('component name=jesus{0}-recurrent-affine-offset{1}-clip type=ClipGradientComponent '
+                  'dim={2} clipping-threshold={3} '.format(l, delay, args.jesus_projected_recurrence_input_dim,
+                                                           args.clipping_threshold), file=f)
+            print('component-node name=jesus{0}-recurrent-affine-offset{1}-clip component=jesus{0}-recurrent-affine-offset{1}-clip '
+                  'input=jesus{0}-recurrent-affine-offset{1}'.format(l, delay), file=f)
+
+        cur_output = 'jesus{0}-forward-output-affine'.format(l)
+
+
+    # with each new layer we regenerate the final-affine component, with a ReLU before it
+    # because the layers we printed don't end with a nonlinearity.
+    print('component name=final-relu type=RectifiedLinearComponent dim={0}'.format(
+            cur_affine_output_dim), file=f)
+    print('component-node name=final-relu component=final-relu input={0}'.format(cur_output),
+          file=f)
+    print('component name=final-affine type=NaturalGradientAffineComponent '
+          'input-dim={0} output-dim={1} learning-rate-factor={2} param-stddev=0.0 bias-stddev=0'.format(
+            cur_affine_output_dim, args.num_targets,
+            args.final_layer_learning_rate_factor), file=f)
+    print('component-node name=final-affine component=final-affine input=final-relu',
+          file=f)
+    # printing out the next two, and their component-nodes, for l > 1 is not
+    # really necessary as they will already exist, but it doesn't hurt and makes
+    # the structure clearer.
+    if args.include_log_softmax == "true":
+        print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format(
+                args.num_targets), file=f)
+        print('component-node name=final-log-softmax component=final-log-softmax '
+              'input=final-affine', file=f)
+        print('output-node name=output input=final-log-softmax', file=f)
+    else:
+        print('output-node name=output input=final-affine', file=f)
+
+    if args.xent_regularize != 0.0:
+        # This block prints the configs for a separate output that will be
+        # trained with a cross-entropy objective in the 'chain' models... this
+        # has the effect of regularizing the hidden parts of the model.  we use
+        # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+        # 1.0 / args.xent_regularize is suitable as it means the xent
+        # final-layer learns at a rate independent of the regularization
+        # constant; and the 0.5 was tuned so as to make the relative progress
+        # similar in the xent and regular final layers.
+        print('component name=final-affine-xent type=NaturalGradientAffineComponent '
+              'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format(
+                cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f)
+        print('component-node name=final-affine-xent component=final-affine-xent input=final-relu',
+              file=f)
+        print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format(
+                args.num_targets), file=f)
+        print('component-node name=final-log-softmax-xent component=final-log-softmax-xent '
+              'input=final-affine-xent', file=f)
+        print('output-node name=output-xent input=final-log-softmax-xent', file=f)
+
+    f.close()
diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh
index 24666b8bd02..c36de8c16bf 100755
--- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh
+++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh
@@ -1,11 +1,12 @@
 #!/bin/bash
 
 # script showing use of nnet3_to_dot.py
-# Copyright 2015  Johns Hopkins University (Author: Vijayaditya Peddinti). 
+# Copyright 2015  Johns Hopkins University (Author: Vijayaditya Peddinti).
 
 # Begin configuration section.
 component_attributes="name,type"
 node_prefixes=""
+info_bin=nnet3-am-info
 echo "$0 $@"  # Print the command line for logging
 
 [ -f ./path.sh ] && . ./path.sh; # source the path.
@@ -20,7 +21,7 @@ if [ $# != 3 ]; then
   echo "  --node-prefixes <string|Lstm1,Lstm2>          # list of prefixes. Nnet3 components/component-nodes with the same prefix"
   echo "                                                # will be clustered together in the dot-graph"
 
-  
+
   exit 1;
 fi
 
@@ -29,10 +30,10 @@ dot_file=$2
 output_file=$3
 
 attr=${node_prefixes:+ --node-prefixes "$node_prefixes"}
-nnet3-am-info $model | \
+$info_bin "$model" | \
   steps/nnet3/dot/nnet3_to_dot.py \
     --component-attributes "$component_attributes" \
-    $attr  > $dot_file
+    $attr "$dot_file"
 
 command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; }
-dot -Tpng $dot_file -o $output_file
+dot -Tpdf "$dot_file" -o "$output_file"
diff --git a/egs/wsj/s5/steps/nnet3/report/README b/egs/wsj/s5/steps/nnet3/report/README
new file mode 100644
index 00000000000..848b4e32fad
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/report/README
@@ -0,0 +1,6 @@
+These python scripts are used to parse the log files generated by nnet3 scripts.
+
+Usage:
+  steps/nnet3/report/nnet3_log_parse.py --key log-probability exp/chain/tdnn_4q
+
+  steps/nnet3/report/nnet3_log_parse.py --key accuracy exp/nnet3/tdnn_sp
diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse.py
new file mode 100755
index 00000000000..225906eea1b
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+# script to parse the train.*.log and compute_prob_{train,valid}.*.log files of an nnet3 experiment directory
+from __future__ import division
+import sys, glob, re, numpy, math, datetime, argparse
+from subprocess import Popen, PIPE
+
+def parse_train_logs(exp_dir):
+  train_log_files = "%s/log/train.*.log" % (exp_dir)
+  train_log_proc = Popen('grep -e Accounting {0}'.format(train_log_files),
+                          shell=True,
+                          stdout=PIPE,
+                          stderr=PIPE)
+  train_log_lines = train_log_proc.communicate()[0]
+  parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*")
+  train_times = {}
+  for line in train_log_lines.split('\n'):
+    mat_obj = parse_regex.search(line)
+    if mat_obj is not None:
+        groups = mat_obj.groups()
+        try:
+            train_times[int(groups[0])][int(groups[1])] = float(groups[2])
+        except KeyError:
+            train_times[int(groups[0])] = {}
+            train_times[int(groups[0])][int(groups[1])] = float(groups[2])
+  iters = train_times.keys()
+  for iter in iters:
+      values = train_times[iter].values()
+      train_times[iter] = max(values)
+  return train_times
+
+def parse_prob_logs(exp_dir, key = 'accuracy'):
+    train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir)
+    valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir)
+    train_prob_proc = Popen('grep -e {0} {1}'.format(key, train_prob_files),
+                            shell=True,
+                            stdout=PIPE,
+                            stderr=PIPE)
+    train_prob_strings = train_prob_proc.communicate()[0]
+    valid_prob_proc = Popen('grep -e {0} {1}'.format(key, valid_prob_files),
+                            shell=True,
+                            stdout=PIPE,
+                            stderr=PIPE)
+    valid_prob_strings = valid_prob_proc.communicate()[0]
+    #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra
+    #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames.
+    parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-]+) .*per frame")
+    train_loss={}
+    valid_loss={}
+
+
+    for line in train_prob_strings.split('\n'):
+        mat_obj = parse_regex.search(line)
+        if mat_obj is not None:
+            groups = mat_obj.groups()
+            if groups[1] == key:
+                train_loss[int(groups[0])] = groups[2]
+
+    for line in valid_prob_strings.split('\n'):
+        mat_obj = parse_regex.search(line)
+        if mat_obj is not None:
+            groups = mat_obj.groups()
+            if groups[1] == key:
+                valid_loss[int(groups[0])] = groups[2]
+    iters = list(set(valid_loss.keys()).intersection(train_loss.keys()))
+    iters.sort()
+    return numpy.array(map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters))
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(description="Prints accuracy/log-probability across iterations")
+  parser.add_argument("--key", type=str, default="accuracy",
+                       help="Value to print out")
+  parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn")
+
+  args = parser.parse_args()
+  exp_dir = args.exp_dir
+  times = parse_train_logs(exp_dir)
+  data = parse_prob_logs(exp_dir, key = args.key)
+  print "%Iter\tduration\ttrain_loss\tvalid_loss\tdifference"
+  for x in data:
+    try:
+      print "%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])
+    except KeyError:
+      continue
+
+  total_time = 0
+  for iter in times.keys():
+    total_time += times[iter]
+  print "Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time)))
diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_progress_log_parse.py b/egs/wsj/s5/steps/nnet3/report/nnet3_progress_log_parse.py
new file mode 100755
index 00000000000..a910d42d6b1
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/report/nnet3_progress_log_parse.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+# script to parse the progress logs (log/progress.*.log) generated by nnet3-show-progress
+from __future__ import division
+import sys, glob, re, numpy, math, datetime, argparse
+from subprocess import Popen, PIPE
+
+#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ]
+
+def parse_difference_string(string):
+    dict = {}
+    for parts in string.split():
+        sub_parts = parts.split(":")
+        dict[sub_parts[0]] = float(sub_parts[1])
+    return dict
+
+def parse_progress_logs(exp_dir, pattern):
+    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
+    progress_per_iter = {}
+    component_names = set([])
+    progress_log_proc = Popen('grep -e "{0}" {1}'.format(pattern, progress_log_files),
+                              shell=True,
+                              stdout=PIPE,
+                              stderr=PIPE)
+    progress_log_lines = progress_log_proc.communicate()[0]
+    parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern))
+    for line in progress_log_lines.split("\n") :
+        mat_obj = parse_regex.search(line)
+        if mat_obj is None:
+            continue
+        groups = mat_obj.groups()
+        iteration = groups[0]
+        differences = parse_difference_string(groups[1])
+        component_names  = component_names.union(differences.keys())
+        progress_per_iter[int(iteration)] = differences
+
+    component_names = list(component_names)
+    component_names.sort()
+    # rearranging the data into an array
+    data = []
+    data.append(["iteration"]+component_names)
+    max_iter = max(progress_per_iter.keys())
+    for iter in range(max_iter + 1):
+        try:
+            component_dict = progress_per_iter[iter]
+        except KeyError:
+            continue
+        iter_values = []
+        for component_name in component_names:
+            try:
+                iter_values.append(component_dict[component_name])
+            except KeyError:
+                # the component was not found this iteration, may be because of layerwise discriminative training
+                iter_values.append(0)
+        data.append([iter] + iter_values)
+    
+    return data
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(description="Prints accuracy/log-probability across iterations")
+  parser.add_argument("--key", type=str, default="relative-difference",
+                       help="Value to print out", choices = ["relative-difference", 'difference'])
+  parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn")
+
+  args = parser.parse_args()
+  exp_dir = args.exp_dir
+  if args.key == "relative-difference":
+      key = "Relative parameter differences"
+  else:
+      key = "Parameter differences"
+  data = parse_progress_logs(exp_dir, key)
+  for row in data:
+      print " ".join(map(lambda x:str(x),row))
diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py
new file mode 100755
index 00000000000..5c042c3a15e
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python
+
+# we're using python 3.x style print but want it to work in python 2.x,
+from __future__ import print_function
+import os
+import argparse
+import sys
+import warnings
+import copy
+import imp
+import ast
+import scipy.signal as signal
+import numpy as np
+
+nodes = imp.load_source('', 'steps/nnet3/components.py')
+
+
+def AddPerDimAffineLayer(config_lines, name, input, input_window):
+    filter_context = int((input_window - 1) / 2)
+    filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1)
+    list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes]
+    filter_input_descriptor = 'Append({0})'.format(' , '.join(list))
+    filter_input_descriptor = {'descriptor':filter_input_descriptor,
+                               'dimension':len(filter_input_splice_indexes) * input['dimension']}
+
+
+    # add permute component to shuffle the feature columns of the Append
+    # descriptor output so that columns corresponding to the same feature index
+    # are contiguous add a block-affine component to collapse all the feature
+    # indexes across time steps into a single value
+    num_feats = input['dimension']
+    num_times = len(filter_input_splice_indexes)
+    column_map = []
+    for i in range(num_feats):
+        for j in range(num_times):
+            column_map.append(j * num_feats + i) 
+    permuted_output_descriptor = nodes.AddPermuteLayer(config_lines,
+            name, filter_input_descriptor, column_map)
+
+    # add a block-affine component
+    output_descriptor = nodes.AddBlockAffineLayer(config_lines, name,
+                                                  permuted_output_descriptor,
+                                                  num_feats, num_feats)
+
+    return [output_descriptor, filter_context, filter_context]
+
+
+def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False):
+    # low-pass smoothing of input was specified. so we will add a low-pass filtering layer
+    lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0)
+    lp_filter = np.append(lp_filter, 0)
+    nodes.WriteKaldiMatrix(np.array([lp_filter]), lpfilt_filename)
+    filter_context = int((num_lpfilter_taps - 1) / 2)
+    filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1)
+    list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes]
+    filter_input_descriptor = 'Append({0})'.format(' , '.join(list))
+    filter_input_descriptor = {'descriptor':filter_input_descriptor,
+                               'dimension':len(filter_input_splice_indexes) * input['dimension']}
+
+    input_x_dim = len(filter_input_splice_indexes)
+    input_y_dim = input['dimension']
+    input_z_dim = 1
+    filt_x_dim = len(filter_input_splice_indexes)
+    filt_y_dim = 1
+    filt_x_step = 1
+    filt_y_step = 1
+    input_vectorization = 'zyx'
+
+    tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name,
+                                                     filter_input_descriptor,
+                                                     input_x_dim, input_y_dim, input_z_dim,
+                                                     filt_x_dim, filt_y_dim,
+                                                     filt_x_step, filt_y_step,
+                                                     1, input_vectorization,
+                                                     filter_bias_file = lpfilt_filename,
+                                                     is_updatable = is_updatable)
+
+
+    return [tdnn_input_descriptor, filter_context, filter_context]
+
+
+
+def PrintConfig(file_name, config_lines):
+    f = open(file_name, 'w')
+    f.write("\n".join(config_lines['components'])+"\n")
+    f.write("\n#Component nodes\n")
+    f.write("\n".join(config_lines['component-nodes']))
+    f.close()
+
+def ParseSpliceString(splice_indexes, label_delay=None):
+    ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ]
+    splice_array = []
+    left_context = 0
+    right_context = 0
+    split1 = args.splice_indexes.split(" ");  # we already checked the string is nonempty.
+    if len(split1) < 1:
+        sys.exit("invalid --splice-indexes argument, too short: "
+                 + args.splice_indexes)
+    try:
+        for string in split1:
+            split2 = string.split(",")
+            if len(split2) < 1:
+                sys.exit("invalid --splice-indexes argument, too-short element: "
+                         + args.splice_indexes)
+            int_list = []
+            for int_str in split2:
+                int_list.append(int(int_str))
+            if not int_list == sorted(int_list):
+                sys.exit("elements of --splice-indexes must be sorted: "
+                         + args.splice_indexes)
+            left_context += -int_list[0]
+            right_context += int_list[-1]
+            splice_array.append(int_list)
+    except ValueError as e:
+        sys.exit("invalid --splice-indexes argument " + args.splice_indexes + e)
+    left_context = max(0, left_context)
+    right_context = max(0, right_context)
+    num_hidden_layers = len(splice_array)
+    input_dim = len(splice_array[0]) * args.feat_dim  +  args.ivector_dim
+
+    return {'left_context':left_context,
+            'right_context':right_context,
+            'splice_indexes':splice_array,
+            'num_hidden_layers':len(splice_array)
+            }
+
+if __name__ == "__main__":
+    # we add compulsary arguments as named arguments for readability
+    parser = argparse.ArgumentParser(description="Writes config files and variables "
+                                                 "for TDNNs creation and training",
+                                     epilog="See steps/nnet3/tdnn/train.sh for example.")
+    # General neural network options
+    parser.add_argument("--splice-indexes", type=str,
+                        help="Splice indexes at input layer, e.g. '-3,-2,-1,0,1,2,3' [compulsary argument]", default="0")
+    parser.add_argument("--feat-dim", type=int,
+                        help="Raw feature dimension, e.g. 13")
+    parser.add_argument("--ivector-dim", type=int,
+                        help="iVector dimension, e.g. 100", default=0)
+    parser.add_argument("--include-log-softmax", type=str,
+                        help="add the final softmax layer ", default="true", choices = ["false", "true"])
+    parser.add_argument("--final-layer-normalize-target", type=float,
+                        help="RMS target for final layer (set to <1 if final layer learns too fast",
+                        default=1.0)
+    parser.add_argument("--subset-dim", type=int, default=0,
+                        help="dimension of the subset of units to be sent to the central frame")
+    parser.add_argument("--pnorm-input-dim", type=int,
+                        help="input dimension to p-norm nonlinearities")
+    parser.add_argument("--pnorm-output-dim", type=int,
+                        help="output dimension of p-norm nonlinearities")
+    parser.add_argument("--relu-dim", type=int,
+                        help="dimension of ReLU nonlinearities")
+    parser.add_argument("--pool-type", type=str, default = 'none',
+                        help="Type of pooling to be used.", choices = ['low-pass', 'sum', 'max', 'weighted-average', 'per-dim-weighted-average', 'none'])
+    parser.add_argument("--pool-window", type=int, default = None,
+                        help="Width of the pooling window")
+    parser.add_argument("--pool-lpfilter-width", type=float,
+                        default = None, help="Nyquist frequency of the lpfilter to be used for pooling")
+    parser.add_argument("--use-presoftmax-prior-scale", type=str,
+                        help="if true, a presoftmax-prior-scale is added",
+                        choices=['true', 'false'], default = "true")
+    parser.add_argument("--num-targets", type=int,
+                        help="number of network targets (e.g. num-pdf-ids/num-leaves)")
+    parser.add_argument("config_dir",
+                        help="Directory to write config files and variables")
+
+    print(' '.join(sys.argv))
+
+    args = parser.parse_args()
+    
+    if not os.path.exists(args.config_dir):
+        os.makedirs(args.config_dir)
+
+    ## Check arguments.
+    if args.splice_indexes is None:
+        sys.exit("--splice-indexes argument is required")
+    if args.feat_dim is None or not (args.feat_dim > 0):
+        sys.exit("--feat-dim argument is required")
+    if args.num_targets is None or not (args.num_targets > 0):
+        sys.exit("--num-targets argument is required")
+    if (args.subset_dim < 0):
+        sys.exit("--subset-dim has to be non-negative")
+    if (args.pool_window is not None) and (args.pool_window <= 0):
+        sys.exit("--pool-window has to be positive")
+
+    if not args.relu_dim is None:
+        if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None:
+            sys.exit("--relu-dim argument not compatible with "
+                     "--pnorm-input-dim or --pnorm-output-dim options");
+        nonlin_input_dim = args.relu_dim
+        nonlin_output_dim = args.relu_dim
+    else:
+        if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0:
+            sys.exit("--relu-dim not set, so expected --pnorm-input-dim and "
+                     "--pnorm-output-dim to be provided.");
+        nonlin_input_dim = args.pnorm_input_dim
+        nonlin_output_dim = args.pnorm_output_dim
+
+    prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(args.config_dir)
+    if args.use_presoftmax_prior_scale == "true":
+        use_presoftmax_prior_scale = True
+    else:
+        use_presoftmax_prior_scale = False
+
+    parsed_splice_output = ParseSpliceString(args.splice_indexes.strip())
+    num_hidden_layers = parsed_splice_output['num_hidden_layers']
+    splice_indexes = parsed_splice_output['splice_indexes']
+
+    config_lines = {'components':[], 'component-nodes':[]}
+
+    config_files={}
+    prev_layer_output = nodes.AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim)
+
+    # Add the init config lines for estimating the preconditioning matrices
+    init_config_lines = copy.deepcopy(config_lines)
+    init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to')
+    init_config_lines['components'].insert(0, '# preconditioning matrix computation')
+    nodes.AddOutputLayer(init_config_lines, prev_layer_output)
+    config_files[args.config_dir + '/init.config'] = init_config_lines
+
+    prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat')
+
+    left_context = 0
+    right_context = 0
+    # we moved the first splice layer to before the LDA..
+    # so the input to the first affine layer is going to [0] index
+    splice_indexes[0] = [0]
+    for i in range(0, num_hidden_layers):
+        # make the intermediate config file for layerwise discriminative training
+        # if specified, pool the input from the previous layer
+
+        # prepare the spliced input
+        if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0):
+            if args.pool_type != "none" and args.pool_window is None:
+                raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(args.pool_type))
+            if args.pool_type in set(["low-pass", "weighted-average"]):
+                if args.pool_type == "weighted-average":
+                    lpfilter_is_updatable = True
+                else:
+                    lpfilter_is_updatable = False
+                # low-pass filter the input to smooth it before the sub-sampling
+                [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines,
+                                                                                      'Tdnn_input_smoother_{0}'.format(i),
+                                                                                       prev_layer_output,
+                                                                                       args.pool_lpfilter_width,
+                                                                                       args.pool_window,
+                                                                                       args.config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i),
+                                                                                       is_updatable = lpfilter_is_updatable)
+                left_context += cur_left_context
+                right_context += cur_right_context
+
+            if args.pool_type == "per-dim-weighted-average":
+                # add permute component to shuffle the feature columns of the Append descriptor output so
+                # that columns corresponding to the same feature index are contiguous
+                # add a block-affine component to collapse all the feature indexes across time steps into a single value
+                [prev_layer_output, cur_left_context, cur_right_context] = AddPerDimAffineLayer(config_lines,
+                                                                                            'Tdnn_input_PDA_{0}'.format(i),
+                                                                                            prev_layer_output,
+                                                                                            args.pool_window)
+    
+                left_context += cur_left_context
+                right_context += cur_right_context
+
+            if args.pool_type == "sum":
+                raise NotImplementedError("Sum-pooling has not been tested yet.")
+
+            if args.pool_type == "max" :
+                raise NotImplementedError("Max-pooling component needs to be reimplemented for this.")
+
+            try:
+                zero_index = splice_indexes[i].index(0)
+            except ValueError:
+                zero_index = None
+            # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor
+            prev_layer_output_descriptor = prev_layer_output['descriptor']
+            subset_output = prev_layer_output
+            if args.subset_dim > 0:
+                # if subset_dim is specified the script expects a zero in the splice indexes
+                assert(zero_index is not None)
+                subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, args.subset_dim)
+                subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i),
+                                 'dimension' : args.subset_dim}
+                config_lines['component-nodes'].append(subset_node_config)
+            appended_descriptors = []
+            appended_dimension = 0
+            for j in range(len(splice_indexes[i])):
+                if j == zero_index:
+                    appended_descriptors.append(prev_layer_output['descriptor'])
+                    appended_dimension += prev_layer_output['dimension']
+                    continue
+                appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j]))
+                appended_dimension += subset_output['dimension']
+            prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)),
+                                 'dimension'  : appended_dimension}
+        else:
+            # this is a normal affine node
+            pass
+        prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i),
+                                                    prev_layer_output, nonlin_output_dim, norm_target_rms = 1.0 if i < num_hidden_layers -1 else args.final_layer_normalize_target)
+        # a final layer is added after each new layer as we are generating configs for layer-wise discriminative training
+        nodes.AddFinalLayer(config_lines, prev_layer_output, args.num_targets,
+                           use_presoftmax_prior_scale = use_presoftmax_prior_scale,
+                           prior_scale_file = prior_scale_file,
+                           include_log_softmax = True if args.include_log_softmax == "true" else False)
+
+        config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines
+        config_lines = {'components':[], 'component-nodes':[]}
+
+    left_context += int(parsed_splice_output['left_context'])
+    right_context += int(parsed_splice_output['right_context'])
+
+    # write the files used by other scripts like steps/nnet3/get_egs.sh
+    f = open(args.config_dir + "/vars", "w")
+    print('left_context=' + str(left_context), file=f)
+    print('right_context=' + str(right_context), file=f)
+    print('num_hidden_layers=' + str(num_hidden_layers), file=f)
+    f.close()
+
+    # printing out the configs
+    # init.config used to train lda-mllt train
+    for key in config_files.keys():
+        PrintConfig(key, config_files[key])
diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh
new file mode 100755
index 00000000000..773e10ccab6
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh
@@ -0,0 +1,660 @@
+#!/bin/bash
+
+# note, TDNN is the same as what we used to call multisplice.
+
+# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
+#           2013  Xiaohui Zhang
+#           2013  Guoguo Chen
+#           2014  Vimal Manohar
+#           2014  Vijayaditya Peddinti
+# Apache 2.0.
+
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=15      # Number of epochs of training;
+                   # the number of iterations is worked out from this.
+initial_effective_lrate=0.01
+final_effective_lrate=0.001
+pnorm_input_dim=3000
+pnorm_output_dim=300
+relu_dim=  # you can use this to make it use ReLU's instead of p-norms.
+rand_prune=4.0 # Relates to a speedup we do for LDA.
+minibatch_size=512  # This default is suitable for GPU-based training.
+                    # Set it to 128 for multi-threaded CPU-based training.
+max_param_change=2.0  # max param change per minibatch
+samples_per_iter=400000 # each iteration of training, see this many samples
+                        # per job.  This option is passed to get_egs.sh
+num_jobs_initial=1  # Number of neural net jobs to run in parallel at the start of training
+num_jobs_final=8   # Number of neural net jobs to run in parallel at the end of training
+prior_subset_size=20000 # 20k samples per job, for computing priors.
+num_jobs_compute_prior=10 # these are single-threaded, run on CPU.
+get_egs_stage=0    # can be used for rerunning after partial
+online_ivector_dir=
+presoftmax_prior_scale_power=-0.25
+use_presoftmax_prior_scale=true
+remove_egs=true  # set to false to disable removing egs after training is done.
+
+max_models_combine=20 # The "max_models_combine" is the maximum number of models we give
+  # to the final 'combine' stage, but these models will themselves be averages of
+  # iteration-number ranges.
+
+shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
+                # on each iter.  You could set it to 0 or to a large value for complete
+                # randomization, but this would both consume memory and cause spikes in
+                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                # not a huge deal though, as samples are anyway randomized right at the start.
+                # (the point of this is to get data in different minibatches on different iterations,
+                # since in the preconditioning method, 2 samples in the same minibatch can
+                # affect each others' gradients.
+
+add_layers_period=2 # by default, add new layers every 2 iterations.
+stage=-6
+exit_stage=-100 # you can set this to terminate the training early.  Exits before running this stage
+
+# count space-separated fields in splice_indexes to get num-hidden-layers.
+splice_indexes="-4,-3,-2,-1,0,1,2,3,4  0  -2,2  0  -4,4 0"
+# Format: one space-separated field per hidden layer; each field is a
+# comma-separated list of frame indices for that layer.
+# Note: a hidden layer may be composed of several components, so hidden-layer indexing differs from the component count.
+chunk_training=false  # if true training is done with chunk randomization, rather than frame randomization
+
+randprune=4.0 # speeds up LDA.
+use_gpu=true    # if true, we run on GPU.
+cleanup=true
+egs_dir=
+max_lda_jobs=10  # use no more than 10 jobs for the LDA accumulation.
+lda_opts=
+egs_opts=
+transform_dir=     # If supplied, this dir used instead of alidir to find transforms.
+cmvn_opts=  # will be passed to get_lda.sh and get_egs.sh, if supplied.
+            # only relevant for "raw" features, not lda.
+feat_type=raw  # or set to 'lda' to use LDA features.
+align_cmd=              # The cmd that is passed to steps/nnet2/align.sh
+align_use_gpu=          # Passed to use_gpu in steps/nnet2/align.sh [yes/no]
+realign_times=          # List of times on which we realign.  Each time is
+                        # floating point number strictly between 0 and 1, which
+                        # will be multiplied by the num-iters to get an iteration
+                        # number.
+num_jobs_align=30       # Number of jobs for realignment
+# End configuration section.
+frames_per_eg=8 # to be passed on to get_egs.sh
+subset_dim=0
+
+trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then  # wrong number of positional arguments: print usage and fail.
+  echo "Usage: $0 [opts] <data> <lang> <ali-dir> <exp-dir>"
+  echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet"
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --num-epochs <#epochs|15>                        # Number of epochs of training"
+  echo "  --initial-effective-lrate <lrate|0.01> # effective learning rate at start of training."
+  echo "  --final-effective-lrate <lrate|0.001>   # effective learning rate at end of training."
+  echo "                                                   # data, 0.00025 for large data"
+  echo "  --num-hidden-layers <#hidden-layers|2>           # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
+  echo "  --add-layers-period <#iters|2>                   # Number of iterations between adding hidden layers"
+  echo "  --presoftmax-prior-scale-power <power|-0.25>     # use the specified power value on the priors (inverse priors) to scale"
+  echo "                                                   # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)"
+  echo "  --num-jobs-initial <num-jobs|1>                  # Number of parallel jobs to use for neural net training, at the start."
+  echo "  --num-jobs-final <num-jobs|8>                    # Number of parallel jobs to use for neural net training, at the end"
+  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job, for CPU-based training (will affect"
+  echo "                                                   # results as well as speed; may interact with batch size; if you increase"
+  echo "                                                   # this, you may want to decrease the batch size."
+  echo "  --parallel-opts <opts|\"-pe smp 16 -l ram_free=1G,mem_free=1G\">      # extra options to pass to e.g. queue.pl for processes that"
+  echo "                                                   # use multiple threads... note, you might have to reduce mem_free,ram_free"
+  echo "                                                   # versus your defaults, because it gets multiplied by the -pe smp argument."
+  echo "  --minibatch-size <minibatch-size|512>            # Size of minibatch to process (note: product with --num-threads"
+  echo "                                                   # should not get too large, e.g. >2k)."
+  echo "  --samples-per-iter <#samples|400000>             # Number of samples of data to process per iteration, per"
+  echo "                                                   # process."
+  echo "  --splice-indexes <string|layer0/-4:-3:-2:-1:0:1:2:3:4> "
+  echo "                                                   # Frame indices used for each splice layer."
+  echo "                                                   # Format : layer<hidden_layer_index>/<frame_indices>....layer<hidden_layer>/<frame_indices> "
+  echo "                                                   # (note: we splice processed, typically 40-dimensional frames"
+  echo "  --lda-dim <dim|''>                               # Dimension to reduce spliced features to with LDA"
+  echo "  --realign-times <list-of-times|\"\">             # A list of space-separated floating point numbers between 0.0 and"
+  echo "                                                   # 1.0 to specify how far through training realignment is to be done"
+  echo "  --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
+  echo "  --align-use-gpu (yes/no)                         # specify is gpu is to be used for realignment"
+  echo "  --num-jobs-align <#njobs|30>                     # Number of jobs to perform realignment"
+  echo "  --stage <stage|-6>                               # Used to run a partially-completed training process from somewhere in"
+  echo "                                                   # the middle."
+
+
+  exit 1;
+fi
+
+data=$1
+lang=$2
+alidir=$3
+dir=$4
+
+if [ ! -z "$realign_times" ]; then
+  [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1
+  [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1
+fi
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/tree; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+
+# Set some variables.
+num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1
+[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1
+[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1
+
+nj=`cat $alidir/num_jobs` || exit 1;  # number of jobs in alignment dir...
+# in this dir we'll have just one job.
+sdata=$data/split$nj
+utils/split_data.sh $data $nj
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+cp $alidir/tree $dir
+
+
+# First work out the feature and iVector dimension, needed for tdnn config creation.
+case $feat_type in  # sets feat_dim according to the feature type; exits on any problem.
+  raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \
+      { echo "$0: Error getting feature dim"; exit 1; }
+    ;;
+  lda)  [ ! -f $alidir/final.mat ] && { echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist."; exit 1; }
+   # get num-rows in lda matrix, which is the lda feature dim.
+   feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1)
+    ;;
+  *)
+   echo "$0: Bad --feat-type '$feat_type';"; exit 1;
+esac
+if [ -z "$online_ivector_dir" ]; then
+  ivector_dim=0
+else
+  ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1;
+fi
+
+
+if [ $stage -le -5 ]; then
+  echo "$0: creating neural net configs";
+
+  if [ ! -z "$relu_dim" ]; then
+    dim_opts="--relu-dim $relu_dim"
+  else
+    dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim  $pnorm_output_dim"
+  fi
+
+  # create the config files for nnet initialization
+  python steps/nnet3/tdnn/make_configs.py  \
+    --splice-indexes "$splice_indexes"  \
+    --subset-dim "$subset_dim" \
+    --feat-dim $feat_dim \
+    --ivector-dim $ivector_dim  \
+     $dim_opts \
+    --use-presoftmax-prior-scale $use_presoftmax_prior_scale \
+    --num-targets  $num_leaves  \
+   $dir/configs || exit 1;
+
+  # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
+  # matrix.  This first config just does any initial splicing that we do;
+  # we do this as it's a convenient way to get the stats for the 'lda-like'
+  # transform.
+  $cmd $dir/log/nnet_init.log \
+    nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1;
+fi
+
+# sourcing the "vars" below sets
+# left_context=(something)
+# right_context=(something)
+# num_hidden_layers=(something)
+. $dir/configs/vars || exit 1;
+
+context_opts="--left-context=$left_context --right-context=$right_context"
+
+! [ "$num_hidden_layers" -gt 0 ] && echo \
+ "$0: Expected num_hidden_layers to be defined" && exit 1;
+
+[ -z "$transform_dir" ] && transform_dir=$alidir
+
+
+if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then
+  extra_opts=()  # optional settings forwarded to get_egs.sh
+  [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
+  [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type)
+  [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
+  extra_opts+=(--transform-dir $transform_dir)
+  extra_opts+=(--left-context $left_context)
+  extra_opts+=(--right-context $right_context)
+  echo "$0: calling get_egs.sh"
+  steps/nnet3/get_egs.sh "${extra_opts[@]}" \
+      --samples-per-iter $samples_per_iter --stage $get_egs_stage \
+      --cmd "$cmd" $egs_opts \
+      --frames-per-eg $frames_per_eg \
+      $data $alidir $dir/egs || exit 1;
+fi
+
+[ -z $egs_dir ] && egs_dir=$dir/egs
+
+if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then
+  echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)";
+  exit 1;
+fi
+if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then
+  echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)";
+  exit 1;
+fi
+
+# copy any of the following that exist, to $dir.
+cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null
+
+# confirm that the egs_dir has the necessary context (especially important if
+# the --egs-dir option was used on the command line).
+egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
+egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
+ ( [ $egs_left_context -lt $left_context ] || \
+   [ $egs_right_context -lt $right_context ] ) && \
+   echo "$0: egs in $egs_dir have too little context" && exit -1;
+
+frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
+num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
+
+# num_archives_expanded considers each separate label-position from
+# 0..frames_per_eg-1 to be a separate archive.
+if [ "$chunk_training" == "true" ]; then
+  num_archives_expanded=$num_archives
+else
+  num_archives_expanded=$[$num_archives*$frames_per_eg]
+fi
+
+[ $num_jobs_initial -gt $num_jobs_final ] && \
+  echo "$0: --num-jobs-initial cannot exceed --num-jobs-final" && exit 1;
+
+[ $num_jobs_final -gt $num_archives_expanded ] && \
+  echo "$0: --num-jobs-final cannot exceed #archives $num_archives_expanded." && exit 1;
+
+
+if [ $stage -le -3 ]; then
+  echo "$0: getting preconditioning matrix for input features."
+  num_lda_jobs=$num_archives
+  [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs
+
+  # Write stats with the same format as stats for LDA.
+  $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
+      nnet3-acc-lda-stats --rand-prune=$rand_prune \
+        $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1;
+
+  all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
+  $cmd $dir/log/sum_transform_stats.log \
+    sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;
+
+  rm $all_lda_accs || exit 1;
+
+  # this computes a fixed affine transform computed in the way we described in
+  # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
+  # of an LDA transform but without dimensionality reduction.
+  $cmd $dir/log/get_transform.log \
+     nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;
+
+  ln -sf ../lda.mat $dir/configs/lda.mat
+fi
+
+
+if [ $stage -le -2 ]; then
+  echo "$0: preparing initial vector for FixedScaleComponent before softmax"
+  echo "  ... using priors^$presoftmax_prior_scale_power and rescaling to average 1"
+
+  # obtains raw pdf count
+  $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \
+     ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
+     post-to-tacc --per-pdf=true  $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1;
+  $cmd $dir/log/sum_pdf_counts.log \
+       vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1;
+  rm $dir/pdf_counts.*
+
+  awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \
+     '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i;  total += $i; }
+        num_pdfs=NF-2;  average_count = total/num_pdfs;
+        for (i=0; i<num_pdfs; i++) stot += (scale[i] = (count[i] + smooth * average_count)^power)
+        printf " [ "; for (i=0; i<num_pdfs; i++) printf("%f ", scale[i]*num_pdfs/stot); print "]" }' \
+     $dir/pdf_counts > $dir/presoftmax_prior_scale.vec
+  ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec
+fi
+
+if [ $stage -le -1 ]; then
+  # Add the first layer; this will add in the lda.mat and
+  # presoftmax_prior_scale.vec.
+  $cmd $dir/log/add_first_layer.log \
+       nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;
+
+  # Convert to .mdl, train the transitions, set the priors.
+  $cmd $dir/log/init_mdl.log \
+    nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \
+    nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1;
+fi
+
+
+# set num_iters so that as close as possible, we process the data $num_epochs
+# times, i.e. ($num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded,
+# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+
+num_archives_to_process=$[$num_epochs*$num_archives_expanded]
+num_archives_processed=0
+num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
+# finish_add_layers_iter must be assigned before the check below, which reads it.
+finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
+! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
+  && echo "$0: Insufficient epochs" && exit 1
+
+
+echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
+
+if $use_gpu; then
+  parallel_suffix=""
+  train_queue_opt="--gpu 1"
+  combine_queue_opt="--gpu 1"
+  prior_gpu_opt="--use-gpu=yes"
+  prior_queue_opt="--gpu 1"
+  parallel_train_opts=
+  if ! cuda-compiled; then
+    echo "$0: WARNING: you are running with one thread but you have not compiled"
+    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
+    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
+    exit 1
+  fi
+else
+  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
+  parallel_train_opts="--use-gpu=no"
+  combine_queue_opt=""  # the combine stage will be quite slow if not using
+                        # GPU, as we didn't enable that program to use
+                        # multiple threads.
+  prior_gpu_opt="--use-gpu=no"
+  prior_queue_opt=""
+fi
+
+
+approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
+# First work out how many iterations we want to combine over in the final
+# nnet3-combine-fast invocation.  (We may end up subsampling from these if the
+# number exceeds max_model_combine).  The number we use is:
+# min(max(max_models_combine, approx_iters_per_epoch_final),
+#     1/2 * iters_after_last_layer_added)
+num_iters_combine=$max_models_combine
+if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then
+   num_iters_combine=$approx_iters_per_epoch_final
+fi
+half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]
+if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then
+  num_iters_combine=$half_iters_after_add_layers
+fi
+first_model_combine=$[$num_iters-$num_iters_combine+1]
+
+x=0
+
+for realign_time in $realign_times; do
+  # Work out the iterations on which we will re-align, if the --realign-times
+  # option was used.  This is slightly approximate.
+  perl -e "exit($realign_time > 0.0 && $realign_time < 1.0 ? 0:1);" || \
+    { echo "Invalid --realign-times option $realign_times: elements must be strictly between 0 and 1."; exit 1; }
+  # map the fraction realign_time to an iteration number, allowing for the job count going from num_jobs_initial to num_jobs_final.
+  realign_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters $realign_time) || exit 1;
+  realign_this_iter[$realign_iter]=$realign_time
+done
+
+cur_egs_dir=$egs_dir
+
+while [ $x -lt $num_iters ]; do
+  [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;
+
+  this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
+
+  ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
+  this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
+
+  echo "On iteration $x, learning rate is $this_learning_rate."
+
+  if [ ! -z "${realign_this_iter[$x]}" ]; then
+    prev_egs_dir=$cur_egs_dir
+    cur_egs_dir=$dir/egs_${realign_this_iter[$x]}
+  fi
+
+  if [ $x -ge 0 ] && [ $stage -le $x ]; then
+    if [ ! -z "${realign_this_iter[$x]}" ]; then
+      time=${realign_this_iter[$x]}
+
+      echo "Getting average posterior for purposes of adjusting the priors."
+      # Note: this just uses CPUs, using a smallish subset of data.
+      # always use the first egs archive, which makes the script simpler;
+      # we're using different random subsets of it.
+      rm $dir/post.$x.*.vec 2>/dev/null
+      $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \
+        nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \
+        nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
+        nnet3-merge-egs ark:- ark:- \| \
+        nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \
+        matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
+
+      sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.
+
+      $cmd $dir/log/vector_sum.$x.log \
+        vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
+      rm $dir/post.$x.*.vec;
+
+      echo "Re-adjusting priors based on computed posteriors"
+      $cmd $dir/log/adjust_priors.$x.log \
+        nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1;
+
+      sleep 2
+
+      steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \
+        --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \
+        --iter $x $data $lang $dir $dir/ali_$time || exit 1
+
+      steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \
+        $prev_egs_dir $cur_egs_dir || exit 1
+
+      if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then
+        steps/nnet3/remove_egs.sh $prev_egs_dir
+      fi
+    fi
+
+    # Set off jobs doing some diagnostics, in the background.
+    # Use the egs dir from the previous iteration for the diagnostics
+    $cmd $dir/log/compute_prob_valid.$x.log \
+      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
+            "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
+    $cmd $dir/log/compute_prob_train.$x.log \
+      nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
+           "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &
+
+    if [ $x -gt 0 ]; then
+      $cmd $dir/log/progress.$x.log \
+        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
+        "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \
+        nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" &
+    fi
+
+    echo "Training neural net (pass $x)"
+
+    if [ $x -gt 0 ] && \
+      [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \
+      [ $[$x%$add_layers_period] -eq 0 ]; then
+      do_average=false # if we've just mixed up, don't do averaging but take the
+                       # best.
+      cur_num_hidden_layers=$[1+$x/$add_layers_period]
+      config=$dir/configs/layer$cur_num_hidden_layers.config
+      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |"
+    else
+      do_average=true
+      if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average.
+      raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|"
+    fi
+    if $do_average; then
+      this_minibatch_size=$minibatch_size
+    else
+      # on iteration zero or when we just added a layer, use a smaller minibatch
+      # size (and we will later choose the output of just one of the jobs): the
+      # model-averaging isn't always helpful when the model is changing too fast
+      # (i.e. it can worsen the objective function), and the smaller minibatch
+      # size will help to keep the update stable.
+      this_minibatch_size=$[$minibatch_size/2];
+    fi
+
+    rm $dir/.error 2>/dev/null
+
+
+    ( # this sub-shell is so that when we "wait" below,
+      # we only wait for the training jobs that we just spawned,
+      # not the diagnostic jobs that we spawned above.
+
+      # We can't easily use a single parallel SGE job to do the main training,
+      # because the computation of which archive and which --frame option
+      # to use for each job is a little complex, so we spawn each one separately.
+      for n in $(seq $this_num_jobs); do
+        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
+                                               # the other indexes from.
+        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
+        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
+        # index; this increases more slowly than the archive index because the
+        # same archive with different frame indexes will give similar gradients,
+        # so we want to separate them in time.
+
+        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
+          nnet3-train $parallel_train_opts \
+          --max-param-change=$max_param_change "$raw" \
+          "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
+          $dir/$[$x+1].$n.raw || touch $dir/.error &
+      done
+      wait
+    )
+    # the error message below is not that informative, but $cmd will
+    # have printed a more specific one.
+    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    if $do_average; then
+      # average the output of the different jobs.
+      $cmd $dir/log/average.$x.log \
+        nnet3-average $nnets_list - \| \
+        nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
+    else
+      # choose the job whose training log reports the highest log-prob-per-frame.
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw  $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
+    fi
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].mdl ] && exit 1;
+    if [ -f $dir/$[$x-1].mdl ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0  ] && [ $[$x-1] -lt $first_model_combine ]; then
+      rm $dir/$[$x-1].mdl
+    fi
+  fi
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+
+if [ $stage -le $num_iters ]; then
+  echo "Doing final combination to produce final.mdl"
+
+  # Now do combination.  In the nnet3 setup, the logic
+  # for doing averaging of subsets of the models in the case where
+  # there are too many models to reliably estimate interpolation
+  # factors (max_models_combine) is moved into the nnet3-combine program.
+  nnets_list=()
+  for n in $(seq 0 $[num_iters_combine-1]); do
+    iter=$[$first_model_combine+$n]
+    mdl=$dir/$iter.mdl
+    [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1;
+    nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|";
+  done
+
+  # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU,
+  # as if there are many models it can give out-of-memory error; and we set
+  # num-threads to 8 to speed it up (this isn't ideal...)
+
+  $cmd $combine_queue_opt $dir/log/combine.log \
+    nnet3-combine --num-iters=40 \
+       --enforce-sum-to-one=true --enforce-positive-weights=true \
+       --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \
+    "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1;
+
+  # Compute the probability of the final, combined model with
+  # the same subset we used for the previous compute_probs, as the
+  # different subsets will lead to different probs.
+  $cmd $dir/log/compute_prob_valid.final.log \
+    nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
+    "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" &
+  $cmd $dir/log/compute_prob_train.final.log \
+    nnet3-compute-prob  "nnet3-am-copy --raw=true $dir/combined.mdl -|" \
+    "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" &
+fi
+
+if [ $stage -le $[$num_iters+1] ]; then
+  echo "Getting average posterior for purposes of adjusting the priors."
+  # Note: this just uses CPUs, using a smallish subset of data.
+  if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1;
+  else egs_part=JOB; fi
+  rm $dir/post.$x.*.vec 2>/dev/null
+  $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \
+    nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \
+    nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \
+    nnet3-merge-egs ark:- ark:- \| \
+    nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \
+      "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \
+    matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1;
+
+  sleep 3;  # make sure there is time for $dir/post.$x.*.vec to appear.
+
+  $cmd $dir/log/vector_sum.$x.log \
+   vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1;
+
+  rm $dir/post.$x.*.vec;
+
+  echo "Re-adjusting priors based on computed posteriors"
+  $cmd $dir/log/adjust_priors.final.log \
+    nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1;
+fi
+
+
+if [ ! -f $dir/final.mdl ]; then
+  echo "$0: $dir/final.mdl does not exist."
+  # we don't want to clean up if the training didn't succeed.
+  exit 1;
+fi
+
+sleep 2
+
+echo Done
+
+if $cleanup; then
+  echo Cleaning up data
+  if $remove_egs && [[ $cur_egs_dir =~ $dir/egs* ]]; then
+    steps/nnet2/remove_egs.sh $cur_egs_dir
+  fi
+
+  echo Removing most of the models
+  for x in `seq 0 $num_iters`; do
+    if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then
+       # delete all but every 100th model; don't delete the ones which combine to form the final model.
+      rm $dir/$x.mdl
+    fi
+  done
+fi
diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh
new file mode 100755
index 00000000000..de8b0519009
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh
@@ -0,0 +1,382 @@
+#!/bin/bash
+
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey)
+#           2014-2015  Vimal Manohar
+# Apache 2.0.
+
+set -e
+set -o pipefail
+
+# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training
+# using egs obtained by steps/nnet3/get_egs_discriminative.sh
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=4       # Number of epochs of training;
+                   # the number of iterations is worked out from this.
+                   # Be careful with this: we actually go over the data
+                   # num-epochs * frame-subsampling-factor times, due to
+                   # using different data-shifts.
+use_gpu=true
+truncate_deriv_weights=0  # can be used to set to zero the weights of derivs from frames
+                          # near the edges.  (counts subsampled frames).
+apply_deriv_weights=true
+run_diagnostics=true
+learning_rate=0.00002
+max_param_change=2.0
+scale_max_param_change=false # if this option is used, scale it by num-jobs.
+
+effective_lrate=    # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet.
+acoustic_scale=0.1  # acoustic scale for MMI/MPFE/SMBR training.
+boost=0.0       # option relevant for MMI
+
+criterion=smbr
+drop_frames=false #  option relevant for MMI
+one_silence_class=true # option relevant for MPE/SMBR
+num_jobs_nnet=4    # Number of neural net jobs to run in parallel.  Note: this
+                   # will interact with the learning rates (if you decrease
+                   # this, you'll have to decrease the learning rate, and vice
+                   # versa).
+regularization_opts=
+minibatch_size=64  # This is the number of examples rather than the number of output frames.
+modify_learning_rates=false
+last_layer_factor=1.0  # relates to modify-learning-rates
+first_layer_factor=1.0 # relates to modify-learning-rates
+shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples
+                # on each iter.  You could set it to 0 or to a large value for complete
+                # randomization, but this would both consume memory and cause spikes in
+                # disk I/O.  Smaller is easier on disk and memory but less random.  It's
+                # not a huge deal though, as samples are anyway randomized right at the start.
+
+
+stage=-3
+
+adjust_priors=true
+num_threads=16  # this is the default but you may want to change it, e.g. to 1 if
+                # using GPUs.
+
+cleanup=true
+keep_model_iters=1
+retroactive=false
+remove_egs=false
+src_model=  # will default to $degs_dir/final.mdl
+
+left_deriv_truncate=   # number of time-steps to avoid using the deriv of, on the left.
+right_deriv_truncate=  # number of time-steps to avoid using the deriv of, on the right.
+# End configuration section.
+
+
+echo "$0 $@"  # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 2 ]; then  # exactly two positional arguments are required; otherwise print the usage text and fail
+  echo "Usage: $0 [opts] <degs-dir> <exp-dir>"
+  echo " e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe"
+  echo ""
+  echo "You have to first call steps/nnet3/get_egs_discriminative.sh to dump the egs."
+  echo "Caution: the options 'drop-frames' and 'criterion' are taken here"
+  echo "even though they were required also by get_egs_discriminative.sh,"
+  echo "and they should normally match."
+  echo ""
+  echo "Main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config file containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --num-epochs <#epochs|4>                        # Number of epochs of training"
+  echo "  --learning-rate <learning-rate|0.00002>          # Learning rate to use"
+  echo "  --effective-lrate <effective-learning-rate>      # If supplied, learning rate will be set to"
+  echo "                                                   # this value times num-jobs-nnet."
+  echo "  --num-jobs-nnet <num-jobs|4>                     # Number of parallel jobs to use for main neural net"
+  echo "                                                   # training (will affect results as well as speed; try 8, 16)"
+  echo "                                                   # Note: if you increase this, you may want to also increase"
+  echo "                                                   # the learning rate.  Also note: if there are fewer archives"
+  echo "                                                   # of egs than this, it will get reduced automatically."
+  echo "  --num-threads <num-threads|16>                   # Number of parallel threads per job (will affect results"
+  echo "                                                   # as well as speed; may interact with batch size; if you increase"
+  echo "                                                   # this, you may want to decrease the batch size.  With GPU, must be 1."
+  echo "  --parallel-opts <opts|\"--num-threads 16 --mem 1G\">      # extra options to pass to e.g. queue.pl for processes that"
+  echo "                                                   # use multiple threads... "
+  echo "  --stage <stage|-3>                               # Used to run a partially-completed training process from somewhere in"
+  echo "                                                   # the middle."
+  echo "  --criterion <criterion|smbr>                     # Training criterion: may be smbr, mmi or mpfe"
+  echo "  --boost <boost|0.0>                              # Boosting factor for MMI (e.g., 0.1)"
+  echo "  --drop-frames <true,false|false>                 # Option that affects MMI training: if true, we exclude gradients from frames"
+  echo "                                                   # where the numerator transition-id is not in the denominator lattice."
+  echo "  --one-silence-class <true,false|true>            # Option that affects MPE/SMBR training (will tend to reduce insertions)"
+  echo "  --modify-learning-rates <true,false|false>       # If true, modify learning rates to try to equalize relative"
+  echo "                                                   # changes across layers."
+  exit 1;
+fi
+
+degs_dir=$1
+dir=$2
+
+[ -z "$src_model" ] && src_model=$degs_dir/final.mdl
+
+# Check some files.
+for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_eg,egs_per_archive} $src_model; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+mkdir -p $dir/log || exit 1;
+
+# copy some things
+for f in splice_opts cmvn_opts tree final.mat; do
+  if [ -f $degs_dir/$f ]; then
+    cp $degs_dir/$f $dir/ || exit 1;
+  fi
+done
+
+silphonelist=`cat $degs_dir/info/silence.csl` || exit 1;
+
+num_archives_priors=0
+if $adjust_priors; then
+  num_archives_priors=`cat $degs_dir/info/num_archives_priors` || exit 1
+fi
+
+frames_per_eg=$(cat $degs_dir/info/frames_per_eg) || { echo "error: no such file $degs_dir/info/frames_per_eg"; exit 1; }
+num_archives=$(cat $degs_dir/info/num_archives) || exit 1;
+frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor)
+
+echo $frame_subsampling_factor > $dir/frame_subsampling_factor
+
+num_archives_expanded=$[$num_archives*$frame_subsampling_factor]
+
+if [ $num_jobs_nnet -gt $num_archives_expanded ]; then  # num_archives_expanded = num_archives * frame_subsampling_factor
+  echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives_expanded,"
+  echo " ... setting it to $num_archives_expanded."
+  num_jobs_nnet=$num_archives_expanded
+fi
+
+num_archives_to_process=$[$num_epochs*$num_archives_expanded]
+num_archives_processed=0
+num_iters=$[$num_archives_to_process/$num_jobs_nnet]
+
+echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
+
+prior_gpu_opt="--use-gpu=no"
+prior_queue_opt=""
+
+if $use_gpu; then
+  parallel_suffix=""
+  train_queue_opt="--gpu 1"
+  parallel_train_opts=
+  if ! cuda-compiled; then
+    echo "$0: WARNING: you are running with one thread but you have not compiled"
+    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
+    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
+    exit 1
+  fi
+else
+  echo "$0: without using a GPU this will be very slow.  nnet3 does not yet support multiple threads."
+  parallel_train_opts="--use-gpu=no"
+  prior_gpu_opt="--use-gpu=no"
+  prior_queue_opt=""
+fi
+
+for e in $(seq 1 $[num_epochs*frame_subsampling_factor]); do
+  x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number.
+  iter_to_epoch[$x]=$e
+done
+
+if [ $stage -le -1 ]; then
+  echo "$0: Copying initial model and modifying preconditioning setup"
+
+  # Note, the baseline model probably had preconditioning, and we'll keep it;
+  # but we want online preconditioning with a larger number of samples of
+  # history, since in this setup the frames are only randomized at the segment
+  # level so they are highly correlated.  It might make sense to tune this a
+  # little, later on, although I doubt it matters once the --num-samples-history
+  # is large enough.
+
+  if [ ! -z "$effective_lrate" ]; then
+    learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);")
+    echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate."
+  fi
+
+  $cmd $dir/log/convert.log \
+    nnet3-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1;
+fi
+
+
+rm -f $dir/.error 2>/dev/null || true 
+
+x=0   
+
+deriv_time_opts=
+[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate"
+[ ! -z "$right_deriv_truncate" ] && \
+  deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))"
+
+while [ $x -lt $num_iters ]; do
+  if [ $stage -le $x ]; then
+    if $run_diagnostics; then
+      # Set off jobs doing some diagnostics, in the background.  # Use the egs dir from the previous iteration for the diagnostics
+      $cmd $dir/log/compute_objf_valid.$x.log \
+        nnet3-discriminative-compute-objf  $regularization_opts \
+        --silence-phones=$silphonelist \
+        --criterion=$criterion --drop-frames=$drop_frames \
+        --one-silence-class=$one_silence_class \
+        --boost=$boost --acoustic-scale=$acoustic_scale \
+        $dir/$x.mdl \
+        ark:$degs_dir/valid_diagnostic.degs &
+      $cmd $dir/log/compute_objf_train.$x.log \
+        nnet3-discriminative-compute-objf  $regularization_opts \
+        --silence-phones=$silphonelist \
+        --criterion=$criterion --drop-frames=$drop_frames \
+        --one-silence-class=$one_silence_class \
+        --boost=$boost --acoustic-scale=$acoustic_scale \
+        $dir/$x.mdl \
+        ark:$degs_dir/train_diagnostic.degs &
+    fi
+    
+    if [ $x -gt 0 ]; then
+      $cmd $dir/log/progress.$x.log \
+        nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \
+        '&&' \
+        nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" &
+    fi
+
+
+    echo "Training neural net (pass $x)"
+    
+    ( # this sub-shell is so that when we "wait" below,
+      # we only wait for the training jobs that we just spawned,
+      # not the diagnostic jobs that we spawned above.
+
+      # We can't easily use a single parallel SGE job to do the main training,
+      # because the computation of which archive and which --frame option
+      # to use for each job is a little complex, so we spawn each one separately.
+      for n in `seq $num_jobs_nnet`; do
+        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
+                                               # the other indexes from.
+        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
+
+        if [ $[num_archives % frame_subsampling_factor] -ne 0 ]; then
+          frame_shift=$[k % frame_subsampling_factor]
+        else
+          frame_shift=$[(k + k/num_archives) % frame_subsampling_factor]
+        fi
+
+        #archive=$[(($n+($x*$num_jobs_nnet))%$num_archives)+1]
+        if $scale_max_param_change; then
+          this_max_param_change=$(perl -e "print ($max_param_change * $num_jobs_nnet);")
+        else
+          this_max_param_change=$max_param_change
+        fi
+
+        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
+          nnet3-discriminative-train --verbose=2 \
+          --apply-deriv-weights=$apply_deriv_weights \
+          $parallel_train_opts $deriv_time_opts \
+          --max-param-change=$this_max_param_change \
+          --silence-phones=$silphonelist \
+          --criterion=$criterion --drop-frames=$drop_frames \
+          --one-silence-class=$one_silence_class \
+          --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \
+          $dir/$x.mdl \
+          "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift --truncate-deriv-weights=$truncate_deriv_weights ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \
+          $dir/$[$x+1].$n.raw || touch $dir/.error &
+      done
+      wait
+    )
+    [ -f $dir/.error ] && { echo "Found $dir/.error. See $dir/log/train.$x.*.log"; exit 1; }
+
+    nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.raw; done)
+
+    # below use run.pl instead of a generic $cmd for these very quick stages,
+    # so that we don't run the risk of waiting for a possibly hard-to-get GPU.
+    run.pl $dir/log/average.$x.log \
+      nnet3-average $nnets_list - \| \
+      nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
+
+    if $modify_learning_rates; then
+      run.pl $dir/log/modify_learning_rates.$x.log \
+        nnet3-modify-learning-rates --retroactive=$retroactive \
+        --last-layer-factor=$last_layer_factor \
+        --first-layer-factor=$first_layer_factor \
+        "nnet3-am-copy --raw $dir/$x.mdl -|" "nnet3-am-copy --raw $dir/$[$x+1].mdl -|" - \| \
+        nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1;
+    fi
+    rm $nnets_list
+  
+    if [ ! -z "${iter_to_epoch[$x]}" ]; then
+      e=${iter_to_epoch[$x]}
+      ln -sf $x.mdl $dir/epoch$e.mdl
+    fi
+
+    if $adjust_priors && [ ! -z "${iter_to_epoch[$x]}" ]; then
+      if [ ! -f $degs_dir/priors_egs.1.ark ]; then
+        echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true."
+        echo "$0: Run this script with --adjust-priors false to not adjust priors"
+        exit 1
+      fi
+      (
+      e=${iter_to_epoch[$x]}
+      rm -f $dir/.error 2> /dev/null || true
+
+      steps/nnet3/adjust_priors.sh --egs-type priors_egs \
+        --num-jobs-compute-prior $num_archives_priors \
+        --cmd "$cmd $prior_queue_opt" --use-gpu false \
+        --raw false --iter epoch$e $dir $degs_dir \
+        || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; }
+      ) &
+    fi
+
+    [ -f $dir/.error ] && { echo "Found $dir/.error. Error on iteration $x"; exit 1; }
+  fi
+
+  x=$[$x+1]
+  num_archives_processed=$[num_archives_processed+num_jobs_nnet]
+done
+
+rm -f $dir/final.mdl 2>/dev/null || true
+cp $dir/$x.mdl $dir/final.mdl
+ln -sf final.mdl $dir/epoch$[num_epochs*frame_subsampling_factor].mdl
+
+if $adjust_priors && [ $stage -le $num_iters ]; then
+  if [ ! -f $degs_dir/priors_egs.1.ark ]; then
+    echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true."
+    echo "$0: Run this script with --adjust-priors false to not adjust priors"
+    exit 1
+  fi
+
+  steps/nnet3/adjust_priors.sh --egs-type priors_egs \
+    --num-jobs-compute-prior $num_archives_priors \
+    --cmd "$cmd $prior_queue_opt" --use-gpu false \
+    --raw false --iter epoch$[num_epochs*frame_subsampling_factor] \
+    $dir $degs_dir || exit 1
+fi
+
+echo Done
+
+#epoch_final_iters=
+#for e in $(seq 0 $num_epochs); do
+#  x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number.
+#  #ln -sf $x.mdl $dir/epoch$e.mdl
+#  epoch_final_iters="$epoch_final_iters $x"
+#done
+
+
+# remove <path>...: delete each path; if it is a soft link, delete the link's target as well.  Missing paths are ignored.
+remove () { local f; for f in "$@"; do if [ -L "$f" ]; then rm -f -- "$(readlink -f "$f")"; fi; rm -f -- "$f"; done; }
+wait  # background jobs (diagnostics, per-epoch prior adjustment reading $degs_dir) must finish before anything is deleted
+if $cleanup && $remove_egs; then  # note: this is false by default.
+  echo Removing training examples
+  remove $degs_dir/degs.*
+  remove $degs_dir/priors_egs.*
+fi
+
+
+if $cleanup; then
+  echo Removing most of the models
+  for x in $(seq 1 $keep_model_iters $num_iters); do  # NOTE(review): seq steps by keep_model_iters, so only every keep_model_iters-th iteration is a deletion candidate -- confirm intent
+    if [ -z "${iter_to_epoch[$x]}" ]; then
+      # $x is not an epoch-final iteration, so its model is not needed any more.
+      rm -f $dir/$x.mdl  # -f: a model that is already gone must not abort the script under 'set -e'
+    fi
+  done
+fi
+
diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh
index e17026e496f..d8ac11da720 100755
--- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh
+++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh
@@ -93,6 +93,7 @@ echo -n >$ieconf
 cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1;
 echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf
 for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf
+echo "--ivector-period=$ivector_period" >>$ieconf
 echo "--splice-config=$dir/conf/splice.conf" >>$ieconf
 echo "--lda-matrix=$srcdir/final.mat" >>$ieconf
 echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf
diff --git a/egs/wsj/s5/steps/score_kaldi.sh b/egs/wsj/s5/steps/score_kaldi.sh
index 8a2aee9d48d..202208c1f5f 100755
--- a/egs/wsj/s5/steps/score_kaldi.sh
+++ b/egs/wsj/s5/steps/score_kaldi.sh
@@ -14,6 +14,7 @@ beam=6
 word_ins_penalty=0.0,0.5,1.0
 min_lmwt=9
 max_lmwt=20
+iter=final
 #end configuration section.
 
 echo "$0 $@"  # Print the command line for logging
diff --git a/src/Makefile b/src/Makefile
index 57a4b98e0aa..c8d2e401866 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -170,7 +170,7 @@ cudamatrix: base util matrix
 nnet: base util matrix cudamatrix
 nnet2: base util matrix thread lat gmm hmm tree transform cudamatrix
 nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix chain
-chain: lat hmm tree fstext matrix cudamatrix util base
+chain: lat hmm tree fstext matrix cudamatrix util base 
 ivector: base util matrix thread transform tree gmm
 #3)Dependencies for optional parts of Kaldi
 onlinebin: base matrix util feat tree optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread
diff --git a/src/base/Makefile b/src/base/Makefile
index 8db3b86d021..88be1b96c9a 100644
--- a/src/base/Makefile
+++ b/src/base/Makefile
@@ -5,7 +5,7 @@ include ../kaldi.mk
 
 TESTFILES = kaldi-math-test io-funcs-test kaldi-error-test timer-test
 
-OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o
+OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o kaldi-types-extra.o
 
 LIBNAME = kaldi-base
 
diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h
index 9629c5466ad..9311645cc0c 100644
--- a/src/base/io-funcs-inl.h
+++ b/src/base/io-funcs-inl.h
@@ -3,6 +3,7 @@
 // Copyright 2009-2011  Microsoft Corporation;  Saarland University;
 //                      Jan Silovsky;   Yanmin Qian;
 //                      Johns Hopkins University (Author: Daniel Povey)
+//                2016  Xiaohui Zhang
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -87,6 +88,112 @@ template<class T> inline void ReadBasicType(std::istream &is,
   }
 }
 
+// Template that covers integers.
+template<class T>
+inline void WriteIntegerPairVector(std::ostream &os, bool binary,
+                                   const std::vector<std::pair<T, T> > &v) {
+  // Compile time assertion that this is not called with a wrong type.
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  if (binary) {
+    char sz = sizeof(T);  // this is currently just a check.
+    os.write(&sz, 1);
+    int32 vecsz = static_cast<int32>(v.size());
+    KALDI_ASSERT((size_t)vecsz == v.size());
+    os.write(reinterpret_cast<const char *>(&vecsz), sizeof(vecsz));
+    if (vecsz != 0) {
+      os.write(reinterpret_cast<const char *>(&(v[0])), sizeof(T) * vecsz * 2);
+    }
+  } else {
+    // focus here is on prettiness of text form rather than
+    // efficiency of reading-in.
+    // reading-in is dominated by low-level operations anyway:
+    // for efficiency use binary.
+    os << "[ ";
+    typename std::vector<std::pair<T, T> >::const_iterator iter = v.begin(),
+                                                            end = v.end();
+    for (; iter != end; ++iter) {
+      if (sizeof(T) == 1)
+        os << static_cast<int16>(iter->first) << ','
+           << static_cast<int16>(iter->second) << ' ';
+      else
+        os << iter->first << ','
+           << iter->second << ' ';
+    }
+    os << "]\n";
+  }
+  if (os.fail()) {
+    throw std::runtime_error("Write failure in WriteIntegerPairVector.");
+  }
+}
+
+// Template that covers integers.
+template<class T> 
+inline void ReadIntegerPairVector(std::istream &is, bool binary,
+                                  std::vector<std::pair<T, T> > *v) {
+  KALDI_ASSERT_IS_INTEGER_TYPE(T);
+  KALDI_ASSERT(v != NULL);
+  if (binary) {
+    int sz = is.peek();
+    if (sz == sizeof(T)) {
+      is.get();
+    } else {  // this is currently just a check.
+      KALDI_ERR << "ReadIntegerPairVector: expected to see type of size "
+                << sizeof(T) << ", saw instead " << sz << ", at file position "
+                << is.tellg();
+    }
+    int32 vecsz;
+    is.read(reinterpret_cast<char *>(&vecsz), sizeof(vecsz));
+    if (is.fail() || vecsz < 0) goto bad;
+    v->resize(vecsz);
+    if (vecsz > 0) {
+      is.read(reinterpret_cast<char *>(&((*v)[0])), sizeof(T)*vecsz*2);
+    }
+  } else {
+    std::vector<std::pair<T, T> > tmp_v;  // use temporary so v doesn't use extra memory
+                           // due to resizing.
+    is >> std::ws;
+    if (is.peek() != static_cast<int>('[')) {
+      KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw "
+                << is.peek() << ", at file position " << is.tellg();
+    }
+    is.get();  // consume the '['.
+    is >> std::ws;  // consume whitespace.
+    while (is.peek() != static_cast<int>(']')) {
+      if (sizeof(T) == 1) {  // read/write chars as numbers.
+        int16 next_t1, next_t2;
+        is >> next_t1;
+        if (is.fail()) goto bad;
+        if (is.peek() != static_cast<int>(',')) 
+          KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw "
+                    << is.peek() << ", at file position " << is.tellg();
+        is.get();  // consume the ','.
+        is >> next_t2 >> std::ws;
+        if (is.fail()) goto bad;
+        else
+            tmp_v.push_back(std::make_pair<T, T>((T)next_t1, (T)next_t2));
+      } else {
+        T next_t1, next_t2;
+        is >> next_t1;
+        if (is.fail()) goto bad;
+        if (is.peek() != static_cast<int>(',')) 
+          KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw "
+                    << is.peek() << ", at file position " << is.tellg();
+        is.get();  // consume the ','.
+        is >> next_t2 >> std::ws;
+        if (is.fail()) goto bad;
+        else
+            tmp_v.push_back(std::make_pair<T, T>((T)next_t1, (T)next_t2));
+      }
+    }
+    is.get();  // get the final ']'.
+    *v = tmp_v;  // could use std::swap to use less temporary memory, but this
+    // uses less permanent memory.
+  }
+  if (!is.fail()) return;
+ bad:
+  KALDI_ERR << "ReadIntegerPairVector: read failure at file position "
+            << is.tellg();
+}
 
 template<class T> inline void WriteIntegerVector(std::ostream &os, bool binary,
                                                  const std::vector<T> &v) {
diff --git a/src/base/io-funcs-test.cc b/src/base/io-funcs-test.cc
index 63506073ff8..dd05326d5ed 100644
--- a/src/base/io-funcs-test.cc
+++ b/src/base/io-funcs-test.cc
@@ -43,8 +43,20 @@ void UnitTestIo(bool binary) {
     WriteIntegerVector(outfile, binary, vec2);
     if (!binary) outfile << " \n";
     std::vector<char> vec3;
-    for (size_t i = 0; i < 10; i++) vec3.push_back(Rand()%100);
+
+    int32 size = RandInt(0, 10);
+    for (size_t i = 0; i < size; i++) vec3.push_back(Rand()%100);
     WriteIntegerVector(outfile, binary, vec3);
+    std::vector<std::pair<int32, int32> > vec4;
+    WriteIntegerPairVector(outfile, binary, vec4);
+    if (!binary && Rand()%2 == 0) outfile << " \n";
+    std::vector<std::pair<uint16, uint16> > vec5;
+    for (size_t i = 0; i < size; i++) vec5.push_back(std::make_pair<uint16, uint16>(Rand()%100 - 10, Rand()%100 - 10));
+    WriteIntegerPairVector(outfile, binary, vec5);
+    if (!binary) outfile << " \n";
+    std::vector<std::pair<char, char> > vec6;
+    for (size_t i = 0; i < size; i++) vec6.push_back(std::make_pair<char, char>(Rand()%100, Rand()%100));
+    WriteIntegerPairVector(outfile, binary, vec6);
     if (!binary && Rand()%2 == 0) outfile << " \n";
     const char *token1 = "Hi";
     WriteToken(outfile, binary, token1);
@@ -90,6 +102,15 @@ void UnitTestIo(bool binary) {
       std::vector<char> vec3_in;
       ReadIntegerVector(infile, binary_in, &vec3_in);
       KALDI_ASSERT(vec3_in == vec3);
+      std::vector<std::pair<int32, int32> > vec4_in;
+      ReadIntegerPairVector(infile, binary_in, &vec4_in);
+      KALDI_ASSERT(vec4_in == vec4);
+      std::vector<std::pair<uint16, uint16> > vec5_in;
+      ReadIntegerPairVector(infile, binary_in, &vec5_in);
+      KALDI_ASSERT(vec5_in == vec5);
+      std::vector<std::pair<char, char> > vec6_in;
+      ReadIntegerPairVector(infile, binary_in, &vec6_in);
+      KALDI_ASSERT(vec6_in == vec6);
       std::string  token1_in, token2_in;
       KALDI_ASSERT(Peek(infile, binary_in) == static_cast<int>(*token1));
       KALDI_ASSERT(PeekToken(infile, binary_in) == static_cast<int>(*token1));
diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h
index ba0cf1c1c7c..4caddc6b5b3 100644
--- a/src/base/io-funcs.h
+++ b/src/base/io-funcs.h
@@ -2,6 +2,7 @@
 
 // Copyright 2009-2011  Microsoft Corporation;  Saarland University;
 //                      Jan Silovsky;   Yanmin Qian
+//                2016  Xiaohui Zhang
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -181,6 +182,16 @@ template<class T> inline void WriteIntegerVector(std::ostream &os, bool binary,
 template<class T> inline void ReadIntegerVector(std::istream &is, bool binary,
                                                 std::vector<T> *v);
 
+/// Function for writing STL vectors of pairs of integer types.
+template<class T>
+inline void WriteIntegerPairVector(std::ostream &os, bool binary,
+                                   const std::vector<std::pair<T, T> > &v);
+
+/// Function for reading STL vector of pairs of integer types.
+template<class T> 
+inline void ReadIntegerPairVector(std::istream &is, bool binary,
+                                  std::vector<std::pair<T, T> > *v);
+
 /// The WriteToken functions are for writing nonempty sequences of non-space
 /// characters. They are not for general strings.
 void WriteToken(std::ostream &os, bool binary, const char *token);
diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h
index e28ddcc1a09..ac590a06a25 100644
--- a/src/base/kaldi-math.h
+++ b/src/base/kaldi-math.h
@@ -41,20 +41,19 @@
 #endif
 
 #ifndef M_PI
-#  define M_PI 3.1415926535897932384626433832795
+#define M_PI 3.1415926535897932384626433832795
 #endif
 
 #ifndef M_SQRT2
-#  define M_SQRT2 1.4142135623730950488016887
+#define M_SQRT2 1.4142135623730950488016887
 #endif
 
-
 #ifndef M_2PI
-#  define M_2PI 6.283185307179586476925286766559005
+#define M_2PI 6.283185307179586476925286766559005
 #endif
 
 #ifndef M_SQRT1_2
-# define M_SQRT1_2 0.7071067811865475244008443621048490
+#define M_SQRT1_2 0.7071067811865475244008443621048490
 #endif
 
 #ifndef M_LOG_2PI
@@ -65,6 +64,11 @@
 #define M_LN2 0.693147180559945309417232121458
 #endif
 
+#ifndef M_LN10
+#define M_LN10 2.302585092994045684017991454684
+#endif
+
+
 #define KALDI_ISNAN std::isnan
 #define KALDI_ISINF std::isinf
 #define KALDI_ISFINITE(x) std::isfinite(x)
diff --git a/src/base/kaldi-types-extra.cc b/src/base/kaldi-types-extra.cc
new file mode 100644
index 00000000000..f7f67a19fb4
--- /dev/null
+++ b/src/base/kaldi-types-extra.cc
@@ -0,0 +1,268 @@
+// base/kaldi-types-extra.cc
+
+// Copyright 2014  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-math.h"
+#include "base/kaldi-types-extra.h"
+#include "base/kaldi-types.h"
+
+namespace kaldi {
+
+template<typename Real>
+void SignedLogReal<Real>::SetZero() {
+  sign_ = false;
+  log_f_ = kLogZeroDouble;
+}
+
+template<typename Real>
+void SignedLogReal<Real>::SetOne() {
+  sign_ = false;
+  log_f_ = 0.0;
+}
+
+template<typename Real>
+void SignedLogReal<Real>::Set(Real f) {
+  // The sign is stored separately and log(|f|) is kept as the magnitude; the
+  // log is evaluated in double precision and then narrowed to Real.  A value
+  // of zero is stored as non-negative.
+  const bool is_negative = (f < 0.0);
+  const double magnitude = static_cast<double>(is_negative ? -f : f);
+  sign_ = is_negative;
+  log_f_ = static_cast<Real>(kaldi::Log(magnitude));
+}
+
+template<typename Real>
+void SignedLogReal<Real>::SetRandn() {
+  Set(kaldi::RandGauss());
+}
+
+template<typename Real>
+void SignedLogReal<Real>::SetRandUniform() {
+  Set(kaldi::RandUniform());
+}
+
+template<typename Real>
+void SignedLogReal<Real>::Log() {
+  KALDI_ASSERT(Positive());  // log is only defined for non-negative values.
+  Set(log_f_);  // log(value) == log_f_, which may be negative: re-encode its sign and log-magnitude.
+}
+
+template<typename Real>
+bool SignedLogReal<Real>::IsZero(Real cutoff) const {
+  return (log_f_ < kaldi::Log(cutoff)); 
+}
+
+template<typename Real>
+bool SignedLogReal<Real>::IsOne(Real cutoff) const {
+  return ( Positive() && (log_f_ > 0 ? LogSub(log_f_, 0) : LogSub(0, log_f_)) < kaldi::Log(cutoff) );
+}
+
+template<typename Real>
+bool SignedLogReal<Real>::ApproxEqual(const SignedLogReal<Real> &other, float tol) const {
+  // Returns true iff |this - other| <= tol * |this|, evaluated in the log domain.
+  if (Sign() == other.sign_) {
+    double tmp1 = log_f_;
+    double tmp2 = other.LogMagnitude();
+    if (tmp1 >= tmp2) {
+      return (LogSub(tmp1, tmp2) <= kaldi::Log(tol) + tmp1);
+    } else {
+      return (LogSub(tmp2, tmp1) <= kaldi::Log(tol) + tmp1);
+    }
+  }
+  // Opposite signs: |this - other| == |this| + |other|, so LogAdd the two magnitudes.
+  return (LogAdd(log_f_, other.LogMagnitude()) <= kaldi::Log(tol) + log_f_);
+}
+
+template<typename Real>
+bool SignedLogReal<Real>::Equal(const SignedLogReal<Real> &other) const {
+  return (sign_ == other.sign_ && log_f_ == other.log_f_);
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::Add(const SignedLogReal<OtherReal> &a) {
+  const bool same_sign = (sign_ == a.Sign());
+  const bool other_is_larger = (log_f_ < a.LogMagnitude());
+  if (same_sign) {
+    log_f_ = LogAdd(log_f_, a.LogMagnitude());  // magnitudes add, sign is kept.
+  } else if (other_is_larger) {
+    sign_ = !sign_;  // the result takes the sign of the larger-magnitude operand.
+    log_f_ = LogSub(a.LogMagnitude(), log_f_);
+  } else {
+    log_f_ = LogSub(log_f_, a.LogMagnitude());
+  }
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::AddReal(OtherReal f) {
+  SignedLogReal<OtherReal> temp(f);
+  Add(temp);
+}
+    
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::AddLogReal(OtherReal log_f) {
+  SignedLogReal<OtherReal> temp(false, log_f);
+  Add(temp);
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::AddMultiplyLogReal(const SignedLogReal<OtherReal> &a, 
+    OtherReal log_b) {
+  SignedLogReal<OtherReal> temp(false, log_b);
+  temp.Multiply(a);
+  Add(temp);
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::Sub(const SignedLogReal<OtherReal> &a) {
+  const bool same_sign = (sign_ == a.Sign());
+  const bool other_is_larger = (log_f_ < a.LogMagnitude());
+  if (!same_sign) {
+    log_f_ = LogAdd(log_f_, a.LogMagnitude());  // opposite signs: magnitudes add, sign is kept.
+  } else if (other_is_larger) {
+    sign_ = !sign_;  // |a| exceeds |this|, so the difference changes sign.
+    log_f_ = LogSub(a.LogMagnitude(), log_f_);
+  } else {
+    log_f_ = LogSub(log_f_, a.LogMagnitude());
+  }
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::SubMultiplyLogReal(const SignedLogReal<OtherReal> &a, 
+    OtherReal log_b) {
+  SignedLogReal<OtherReal> temp(false, log_b);
+  temp.Multiply(a);
+  Sub(temp);
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::Multiply(const SignedLogReal<OtherReal> &a) {
+  // The product is negative exactly when the operands differ in sign, and
+  // multiplying the values corresponds to adding the log-magnitudes.
+  sign_ = (sign_ != a.Sign());
+  log_f_ += a.LogMagnitude();
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::MultiplyReal(OtherReal f) {
+  SignedLogReal<OtherReal> temp(f);
+  Multiply(temp);
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::MultiplyLogReal(OtherReal log_f) {
+  log_f_ += log_f;
+}
+
+template<typename Real>
+template<typename OtherReal>
+void SignedLogReal<Real>::DivideBy(const SignedLogReal<OtherReal> &a) {
+  // The quotient is negative exactly when the operands differ in sign, and
+  // dividing the values corresponds to subtracting the log-magnitudes.
+  sign_ = (sign_ != a.Sign());
+  log_f_ -= a.LogMagnitude();
+}
+    
+template<typename Real>
+SignedLogReal<Real> SignedLogReal<Real>::operator+(const SignedLogReal<Real> &a) const {
+  SignedLogReal<Real> tmp(*this);
+  tmp.Add(a);
+  return tmp;
+}
+
+template<typename Real>
+SignedLogReal<Real> SignedLogReal<Real>::operator*(const SignedLogReal<Real> &a) const {
+  SignedLogReal<Real> tmp(*this);
+  tmp.Multiply(a);
+  return tmp;
+}
+
+template<typename Real>
+SignedLogReal<Real> SignedLogReal<Real>::operator/(const SignedLogReal<Real> &a) const {
+  SignedLogReal<Real> tmp(*this);
+  tmp.DivideBy(a);
+  return tmp;
+}
+
+template<typename Real>
+SignedLogReal<Real> operator-(const SignedLogReal<Real> &a) {
+  SignedLogReal<Real> tmp(a);
+  tmp.Negate();
+  return tmp;
+}
+
+template<typename Real>
+SignedLogReal<Real> SignedLogReal<Real>::operator-(const SignedLogReal<Real> &a) const {
+  SignedLogReal<Real> tmp(*this);
+  tmp.Sub(a);
+  return tmp;
+}
+
+template void SignedLogReal<double>::Add(const SignedLogReal<double> &a);
+template void SignedLogReal<float>::Add(const SignedLogReal<float> &);
+template void SignedLogReal<double>::AddReal(double f);
+template void SignedLogReal<float>::AddReal(float f);
+template void SignedLogReal<double>::AddLogReal(double f);
+template void SignedLogReal<float>::AddLogReal(float f);
+template void SignedLogReal<double>::AddMultiplyLogReal(const SignedLogReal<double> &a, double log_b);
+template void SignedLogReal<float>::AddMultiplyLogReal(const SignedLogReal<float> &a, float log_b);
+template void SignedLogReal<double>::Sub(const SignedLogReal<double> &a);
+template void SignedLogReal<float>::Sub(const SignedLogReal<float> &);
+template void SignedLogReal<double>::SubMultiplyLogReal(const SignedLogReal<double> &a, double log_b);
+template void SignedLogReal<float>::SubMultiplyLogReal(const SignedLogReal<float> &a, float log_b);
+template void SignedLogReal<double>::Multiply(const SignedLogReal<double> &a);
+template void SignedLogReal<float>::Multiply(const SignedLogReal<float> &a);
+template void SignedLogReal<double>::MultiplyReal(double f);
+template void SignedLogReal<float>::MultiplyReal(float f);
+template void SignedLogReal<double>::MultiplyLogReal(double f);
+template void SignedLogReal<float>::MultiplyLogReal(float f);
+template void SignedLogReal<double>::DivideBy(const SignedLogReal<double> &a);
+template void SignedLogReal<float>::DivideBy(const SignedLogReal<float> &a);
+
+template SignedLogReal<double> SignedLogReal<double>::operator+(const SignedLogReal<double> &a) const;
+template SignedLogReal<double> SignedLogReal<double>::operator*(const SignedLogReal<double> &a) const ;
+template SignedLogReal<double> SignedLogReal<double>::operator/(const SignedLogReal<double> &a) const;
+template SignedLogReal<double> SignedLogReal<double>::operator-(const SignedLogReal<double> &a) const;
+template SignedLogReal<double> operator-(const SignedLogReal<double> &a);
+
+template SignedLogReal<double>::SignedLogReal(double f);
+template SignedLogReal<double>::SignedLogReal(float f);
+template SignedLogReal<float>::SignedLogReal(double f);
+template SignedLogReal<float>::SignedLogReal(float f);
+
+template SignedLogReal<double>::SignedLogReal(bool s, double);
+template SignedLogReal<double>::SignedLogReal(bool s, float);
+template SignedLogReal<float>::SignedLogReal(bool s, double);
+template SignedLogReal<float>::SignedLogReal(bool s, float);
+
+template SignedLogReal<double>::SignedLogReal(const SignedLogReal<double> &);
+template SignedLogReal<double>::SignedLogReal(const SignedLogReal<float> &);
+template SignedLogReal<float>::SignedLogReal(const SignedLogReal<double> &);
+template SignedLogReal<float>::SignedLogReal(const SignedLogReal<float> &);
+
+template class SignedLogReal<double>;
+
+} // namespace kaldi
diff --git a/src/base/kaldi-types-extra.h b/src/base/kaldi-types-extra.h
new file mode 100644
index 00000000000..6d1a4656a3e
--- /dev/null
+++ b/src/base/kaldi-types-extra.h
@@ -0,0 +1,171 @@
+// base/kaldi-types-extra.h
+
+// Copyright 2014  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_BASE_KALDI_TYPES_EXTRA_H_
+#define KALDI_BASE_KALDI_TYPES_EXTRA_H_ 1
+
+#include "base/kaldi-math.h"
+#include "base/kaldi-types.h"
+#include "base/kaldi-common.h"
+
+namespace kaldi {
+
+template<typename Real>
+class SignedLogReal {
+  public:
+    /// Returns the sign of the real number
+    inline bool Sign() const { return sign_; }
+    inline bool Positive() const { return (!sign_); }
+    inline bool Negative() const { return sign_; }
+
+    /// Returns the log magnitude of the real number
+    inline Real LogMagnitude() const { return log_f_; }
+
+    /// Returns the real number (sign applied to the magnitude) as type Real
+    inline Real Value() const { return
+      static_cast<Real>(Exp(static_cast<double>(log_f_)) * 
+          (sign_ ? -1.0 : 1.0)); }
+
+    /*    Basic setting-to-special values functions.    */
+    
+    /// Sets the value to zero
+    void SetZero();
+
+    /// Sets the number to particular value
+    void Set(Real);
+
+    /// Sets the number to one
+    void SetOne();
+
+    /// Sets the number to random value from normal distribution
+    void SetRandn();
+
+    /// Sets the number to uniformly distributed on (0,1)
+    void SetRandUniform();
+
+    /* Various special functions. */
+    
+    void Negate() { sign_ = !sign_; };
+
+    /// Apply log to the value if the number is positive
+    /// or exit with error.
+    void Log();
+    
+    /// returns true if the number is zero
+    bool IsZero(Real cutoff = 1.0e-40) const;
+
+    /// returns true if the number is one 
+    bool IsOne(Real cutoff = 1.0e-06) const;
+
+    /// Returns true if |this - other| <= tol * |this|
+    bool ApproxEqual(const SignedLogReal<Real> &other, float tol = 0.01) const;
+
+    /// Tests for exact equality
+    bool Equal(const SignedLogReal<Real> &other) const;
+
+    /// Add another object of same type
+    template<typename OtherReal> void Add(const SignedLogReal<OtherReal> &a); 
+
+    /// Add a real number
+    template<typename OtherReal> void AddReal(OtherReal f);
+
+    /// Add log real number
+    template<typename OtherReal> void AddLogReal(OtherReal log_f);
+    
+    /// Add SignedLogReal multiplied by real number in log
+    template<typename OtherReal> void AddMultiplyLogReal(const SignedLogReal<OtherReal> &a, OtherReal log_f);
+    
+    /// Subtract another object of same type
+    template<typename OtherReal> void Sub(const SignedLogReal<OtherReal> &a); 
+    
+    /// Subtract SignedLogReal multiplied by real number in log
+    template<typename OtherReal> void SubMultiplyLogReal(const SignedLogReal<OtherReal> &a, OtherReal log_f);
+
+    /// Multiply by another object of same type
+    template<typename OtherReal> void Multiply(const SignedLogReal<OtherReal> &a);
+
+    /// Multiply by real number
+    template<typename OtherReal> void MultiplyReal(OtherReal f);
+    
+    /// Multiply by log real number
+    template<typename OtherReal> void MultiplyLogReal(OtherReal log_f);
+    
+    /// DivideBy another object of same type
+    template<typename OtherReal> void DivideBy(const SignedLogReal<OtherReal> &a);
+
+    /// Operators
+    SignedLogReal<Real> operator+(const SignedLogReal<Real> &a) const;
+    SignedLogReal<Real> operator*(const SignedLogReal<Real> &a) const;
+    SignedLogReal<Real> operator/(const SignedLogReal<Real> &a) const;
+    SignedLogReal<Real> operator-(const SignedLogReal<Real> &a) const;
+
+    /// Constructors.
+    /// Default initializer: sets the value to zero (log-magnitude kLogZeroDouble).
+    explicit SignedLogReal() :
+      sign_(false), log_f_(kLogZeroDouble) { 
+      KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+    }
+  
+    /// Initialize from a real number
+    template<typename OtherReal>
+    explicit SignedLogReal(OtherReal f) { 
+      KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+      KALDI_ASSERT_IS_FLOATING_TYPE(OtherReal);
+      if (f < 0.0) {
+        sign_ = true;
+        log_f_ = static_cast<Real>(kaldi::Log(static_cast<double>(-f)));
+      } else {
+        sign_ = false;
+        log_f_ = static_cast<Real>(kaldi::Log(static_cast<double>(f)));
+      }
+    }
+
+    /// Initialize from sign and log real number
+    template<typename OtherReal>
+    explicit SignedLogReal(bool sign, OtherReal log_f) :
+      sign_(sign), log_f_(log_f) {
+      KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+      KALDI_ASSERT_IS_FLOATING_TYPE(OtherReal);
+    }
+
+    /// Initialize from object
+    template<typename OtherReal>
+    explicit SignedLogReal(const SignedLogReal<OtherReal> &a) :
+      sign_(a.Sign()), log_f_(a.LogMagnitude()) {
+      KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+      KALDI_ASSERT_IS_FLOATING_TYPE(OtherReal);
+    }
+
+  private:
+    bool sign_;
+    Real log_f_;
+};
+
+template<typename Real>
+inline std::ostream & operator << (std::ostream & os, const SignedLogReal<Real> &a) {
+  os << (a.Negative() ? "-" : "") << "1.0 * Exp(" << a.LogMagnitude() << ")";
+  return os;
+}
+
+template<typename Real>
+SignedLogReal<Real> operator-(const SignedLogReal<Real> &a);
+
+} // namespace kaldi
+
+#endif  // KALDI_BASE_KALDI_TYPES_EXTRA_H_
diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc
index 20f58d52b7d..42404e38384 100644
--- a/src/bin/vector-sum.cc
+++ b/src/bin/vector-sum.cc
@@ -101,7 +101,8 @@ int32 TypeOneUsage(const ParseOptions &po) {
 }
 
 int32 TypeTwoUsage(const ParseOptions &po,
-                   bool binary) {
+                   bool binary,
+                   bool average = false) {
   KALDI_ASSERT(po.NumArgs() == 2);
   KALDI_ASSERT(ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier &&
                "vector-sum: first argument must be an rspecifier");
@@ -133,6 +134,8 @@ int32 TypeTwoUsage(const ParseOptions &po,
       }
     }
   }
+  
+  if (num_done > 0 && average) sum.Scale(1.0 / num_done);
 
   Vector<BaseFloat> sum_float(sum);
   WriteKaldiObject(sum_float, po.GetArg(2), binary);
@@ -199,12 +202,13 @@ int main(int argc, char *argv[]) {
         " e.g.: vector-sum --binary=false 1.vec 2.vec 3.vec sum.vec\n"
         "See also: copy-vector, dot-weights\n";
         
-    bool binary;
+    bool binary, average = false;
     
     ParseOptions po(usage);
 
     po.Register("binary", &binary, "If true, write output as binary (only "
                 "relevant for usage types two or three");
+    po.Register("average", &average, "Do average instead of sum");
     
     po.Read(argc, argv);
 
@@ -219,7 +223,7 @@ int main(int argc, char *argv[]) {
                ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) ==
                kNoWspecifier) {
       // input from a single table, output not to table.
-      exit_status = TypeTwoUsage(po, binary);
+      exit_status = TypeTwoUsage(po, binary, average);
     } else if (po.NumArgs() >= 2 &&
                ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier &&
                ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == 
diff --git a/src/chain/Makefile b/src/chain/Makefile
index e24913c06f2..c02844767f8 100644
--- a/src/chain/Makefile
+++ b/src/chain/Makefile
@@ -12,7 +12,7 @@ OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \
           language-model.o chain-denominator.o chain-training.o
 
 ifeq ($(CUDA), true)
-  OBJFILES += chain-kernels.o 
+  OBJFILES += chain-kernels.o
 endif
 
 LIBNAME = kaldi-chain
@@ -53,7 +53,7 @@ endif
 
 ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
            ../fstext/kaldi-fstext.a \
-           ../matrix/kaldi-matrix.a ../cudamatrix/kaldi-cudamatrix.a \
+           ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \
           ../util/kaldi-util.a ../base/kaldi-base.a
 
 
diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h
index 7ea58038918..52e388a3f2e 100644
--- a/src/chain/chain-datastruct.h
+++ b/src/chain/chain-datastruct.h
@@ -45,7 +45,8 @@ extern "C" {
   };
 
 
-
+  // Search for this in chain-kernels.cu for an explanation.
+  enum { kThresholdingPowerOfTwo = 14 };
 
 }
 
diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc
index a654ad7d05f..ceb61a550f0 100644
--- a/src/chain/chain-den-graph.cc
+++ b/src/chain/chain-den-graph.cc
@@ -139,77 +139,6 @@ void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) {
 
   Vector<BaseFloat> avg_prob_float(avg_prob);
   initial_probs_ = avg_prob_float;
-  special_hmm_state_ = ComputeSpecialState(fst, avg_prob_float);
-}
-
-int32 NumStatesThatCanReach(const fst::StdVectorFst &fst,
-                            int32 dest_state) {
-  int32 num_states = fst.NumStates(),
-      num_states_can_reach = 0;
-  KALDI_ASSERT(dest_state >= 0 && dest_state < num_states);
-  std::vector<bool> can_reach(num_states, false);
-  std::vector<std::vector<int32> > reverse_transitions(num_states);
-  for (int32 s = 0; s < num_states; s++)
-    for (fst::ArcIterator<fst::StdVectorFst> aiter(fst, s); !aiter.Done();
-         aiter.Next())
-      reverse_transitions[aiter.Value().nextstate].push_back(s);
-  std::vector<int32> queue;
-  can_reach[dest_state] = true;
-  queue.push_back(dest_state);
-  num_states_can_reach++;
-  while (!queue.empty()) {
-    int32 state = queue.back();
-    queue.pop_back();
-    std::vector<int32>::const_iterator iter = reverse_transitions[state].begin(),
-        end = reverse_transitions[state].end();
-    for (; iter != end; ++iter) {
-      int32 prev_state = *iter;
-      if (!can_reach[prev_state]) {
-        can_reach[prev_state] = true;
-        queue.push_back(prev_state);
-        num_states_can_reach++;
-      }
-    }
-  }
-  KALDI_ASSERT(num_states_can_reach >= 1 &&
-               num_states_can_reach <= num_states);
-  return num_states_can_reach;
-}
-
-
-int32 DenominatorGraph::ComputeSpecialState(
-    const fst::StdVectorFst &fst,
-    const Vector<BaseFloat> &initial_probs) {
-  int32 num_states = initial_probs.Dim();
-  std::vector<std::pair<BaseFloat, int32> > pairs(num_states);
-  for (int32 i = 0; i < num_states; i++)
-    pairs.push_back(std::pair<BaseFloat, int32>(-initial_probs(i), i));
-  // the first element of each pair is the negative of the initial-prob,
-  // so when we sort, the highest initial-prob will be first.
-  std::sort(pairs.begin(), pairs.end());
-  // this threshold of 0.75 is pretty arbitrary.  We reject any
-  // state if it can't be reached by 75% of all other states.
-  // In practice we think that states will either be reachable by
-  // almost-all states, or almost-none (e.g. states that are active
-  // only at utterance-beginning), so this threshold shouldn't
-  // be too critical.
-  int32 min_states_can_reach = 0.75 * num_states;
-  for (int32 i = 0; i < num_states; i++) {
-    int32 state = pairs[i].second;
-    int32 n = NumStatesThatCanReach(fst, state);
-    if (n < min_states_can_reach) {
-      KALDI_WARN << "Rejecting state " << state << " as a 'special' HMM state "
-                 << "(for renormalization in fwd-bkwd), because it's only "
-                 << "reachable by " << n << " out of " << num_states
-                 << " states.";
-    } else {
-      return state;
-    }
-  }
-  KALDI_ERR << "Found no states that are reachable by at least "
-            << min_states_can_reach << " out of " << num_states
-            << " states.  This is unexpected.  Change the threshold";
-  return -1;
 }
 
 void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst,
@@ -261,6 +190,34 @@ void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) {
   fst::Decode(fst, encoder);
 }
 
+// Renumbers the states of 'fst' so that states with many transitions come
+// first.  Each state is ranked by (arcs entering it + arcs leaving it), in
+// decreasing order; ties are broken by the original state index.  Used in
+// CreateDenominatorFst, in the hope that heavily-connected states get
+// scheduled first and so won't delay the computation.
+static void SortOnTransitionCount(fst::StdVectorFst *fst) {
+  const int32 num_states = fst->NumStates();
+  // keyed[s] = ( -(num-arcs-into(s) + num-arcs-out-of(s)), s ); the count is
+  // negated so that an ascending sort puts the busiest states first.
+  std::vector<std::pair<int32, int32> > keyed(num_states);
+  for (int32 s = 0; s < num_states; s++)
+    keyed[s] = std::make_pair(0, s);
+  for (int32 s = 0; s < num_states; s++) {
+    fst::ArcIterator<fst::StdVectorFst> arc_iter(*fst, s);
+    while (!arc_iter.Done()) {
+      keyed[s].first--;
+      keyed[arc_iter.Value().nextstate].first--;
+      arc_iter.Next();
+    }
+  }
+  std::sort(keyed.begin(), keyed.end());
+  // order[old_state] = new_state, the mapping passed to fst::StateSort.
+  std::vector<fst::StdArc::StateId> order(num_states);
+  for (int32 rank = 0; rank < num_states; rank++)
+    order[keyed[rank].second] = rank;
+  fst::StateSort(fst, order);
+}
+
 void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) {
   for (int32 i = 1; i <= 3; i++) {
     fst::PushSpecial(fst, fst::kDelta * 0.01);
@@ -414,6 +371,8 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep,
 
   DenGraphMinimizeWrapper(&transition_id_fst);
 
+  SortOnTransitionCount(&transition_id_fst);
+
   *den_fst = transition_id_fst;
   CheckDenominatorFst(trans_model.NumPdfs(), *den_fst);
   PrintDenGraphStats(*den_fst);
diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h
index 8e5ee39e4bd..b2510651f39 100644
--- a/src/chain/chain-den-graph.h
+++ b/src/chain/chain-den-graph.h
@@ -88,13 +88,6 @@ class DenominatorGraph {
   // Note: we renormalize each HMM-state to sum to one before doing this.
   const CuVector<BaseFloat> &InitialProbs() const;
 
-  // returns the index of the HMM-state that has the highest value in
-  // InitialProbs (and which we believe will always be reachable from most other
-  // states... later on we may check this more carefully [TODO]).
-  // It's used in getting the 'arbitrary_scale' value to keep the alphas
-  // in a good dynamic range.
-  int32 SpecialHmmState() const { return special_hmm_state_; }
-
   // This function outputs a modifified version of the FST that was used to
   // build this object, that has an initial-state with epsilon transitions to
   // each state, with weight determined by initial_probs_; and has each original
@@ -116,23 +109,15 @@ class DenominatorGraph {
   // functions called from the constructor
   void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds);
 
-  // work out the initial-probs and the 'special state'
-  // Note, there are no final-probs; we treat all states as final
-  // with probability one [we have a justification for this..
-  // assuming it's roughly a well-normalized HMM, this makes sense;
-  // note that we train on chunks, so the beginning and end of a chunk
-  // appear at arbitrary points in the sequence.
-  // At both beginning and end of the chunk, we limit ourselves to
-  // only those pdf-ids that were allowed in the numerator sequence.
+  // work out the initial-probs.  Note, there are no final-probs; we treat all
+  // states as final with probability one [we have a justification for this..
+  // assuming it's roughly a well-normalized HMM, this makes sense; note that we
+  // train on chunks, so the beginning and end of a chunk appear at arbitrary
+  // points in the sequence.  At both beginning and end of the chunk, we limit
+  // ourselves to only those pdf-ids that were allowed in the numerator
+  // sequence.
   void SetInitialProbs(const fst::StdVectorFst &fst);
 
-  // return a suitable 'special' HMM-state used for normalizing probabilities in
-  // the forward-backward.  It has to have a reasonably high probability and be
-  // reachable from most of the graph.  returns a suitable state-index
-  // that we can set special_hmm_state_ to.
-  int32 ComputeSpecialState(const fst::StdVectorFst &fst,
-                            const Vector<BaseFloat> &initial_probs);
-
   // forward_transitions_ is an array, indexed by hmm-state index,
   // of start and end indexes into the transition_ array, which
   // give us the set of transitions out of this state.
@@ -152,23 +137,9 @@ class DenominatorGraph {
   // distribution of the HMM.  This isn't too critical.
   CuVector<BaseFloat> initial_probs_;
 
-  // The index of a somewhat arbitrarily chosen HMM-state that we
-  // use for adjusting the alpha probabilities.  It needs to be
-  // one that is reachable from all states (i.e. not a special
-  // state that's only reachable at sentence-start).  We choose
-  // whichever one has the greatest initial-prob.  It's set
-  // in SetInitialProbs().
-  int32 special_hmm_state_;
-
   int32 num_pdfs_;
 };
 
-// returns the number of states from which there is a path to
-// 'dest_state'.  Utility function used in selecting 'special' state
-// for normalization of probabilities.
-int32 NumStatesThatCanReach(const fst::StdVectorFst &fst,
-                            int32 dest_state);
-
 
 // Function that does acceptor minimization without weight pushing...
 // this is useful when constructing the denominator graph.
diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc
index eaee850a999..258c33cd465 100644
--- a/src/chain/chain-denominator.cc
+++ b/src/chain/chain-denominator.cc
@@ -39,12 +39,23 @@ DenominatorComputation::DenominatorComputation(
         std::min<int32>(exp_nnet_output_transposed_.NumCols(),
                         static_cast<int32>(kMaxDerivTimeSteps) *
                         num_sequences_)),
-    alpha_(frames_per_sequence_ + 1, den_graph_.NumStates() * num_sequences_,
+    alpha_(frames_per_sequence_ + 1,
+           den_graph_.NumStates() * num_sequences_ + num_sequences_,
            kUndefined),
-    beta_(2, den_graph_.NumStates() * num_sequences_, kUndefined),
+    beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_,
+          kUndefined),
     tot_prob_(num_sequences_, kUndefined),
     tot_log_prob_(num_sequences_, kUndefined),
-    log_correction_term_(num_sequences_, kUndefined) {
+    log_correction_term_(num_sequences_, kUndefined),
+    ok_(true) {
+  KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 &&
+               opts_.leaky_hmm_coefficient < 1.0);
+  // make sure the alpha sums and beta sums are zeroed.
+  alpha_.ColRange(den_graph_.NumStates() * num_sequences_,
+                  num_sequences_).SetZero();
+  beta_.ColRange(den_graph_.NumStates() * num_sequences_,
+                 num_sequences_).SetZero();
+
   KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0);
   exp_nnet_output_transposed_.ApplyExp();
 }
@@ -70,13 +81,12 @@ void DenominatorComputation::AlphaFirstFrame() {
 void DenominatorComputation::AlphaGeneralFrame(int32 t) {
   KALDI_ASSERT(t > 0 && t <= frames_per_sequence_);
   BaseFloat *this_alpha = alpha_.RowData(t);
-  const BaseFloat *prev_alpha = alpha_.RowData(t - 1);
+  const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1);
   const Int32Pair *backward_transitions = den_graph_.BackwardTransitions();
   const DenominatorGraphTransition *transitions = den_graph_.Transitions();
   int32 num_pdfs = exp_nnet_output_transposed_.NumRows(),
       num_hmm_states = den_graph_.NumStates(),
-      num_sequences = num_sequences_,
-      special_hmm_state = den_graph_.SpecialHmmState();
+      num_sequences = num_sequences_;
 
   // 'probs' is the matrix of pseudo-likelihoods for frame t - 1.
   CuSubMatrix<BaseFloat> probs(exp_nnet_output_transposed_, 0, num_pdfs,
@@ -90,8 +100,8 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) {
     dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1);
 
     cuda_chain_hmm_forward(dimGrid, dimBlock, backward_transitions, transitions,
-                           num_sequences, special_hmm_state, prob_data,
-                           probs.Stride(), prev_alpha, this_alpha);
+                           num_sequences, prob_data, probs.Stride(),
+                           prev_alpha_dash, this_alpha);
 
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
@@ -110,18 +120,19 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) {
           int32 pdf_id = trans_iter->pdf_id,
               prev_hmm_state = trans_iter->hmm_state;
           BaseFloat prob = prob_data[pdf_id * prob_stride + s],
-              this_prev_alpha = prev_alpha[prev_hmm_state * num_sequences + s];
+              this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s];
           this_tot_alpha += this_prev_alpha * transition_prob * prob;
         }
-        // Let arbitrary_scale be the inverse of the alpha value for the
-        // hmm-state indexed special_hmm_state_ on the previous frame (for this
-        // sequence); we multiply this into all the transition-probabilities
-        // from the previous frame to this frame, in both the forward and
-        // backward passes, in order to keep the alphas in a good numeric range.
-        // This won't affect the posteriors, but when computing the total
-        // likelihood we'll need to compensate for it later on.
+        // Let arbitrary_scale be the inverse of the alpha-sum value that we
+        // store in the same place we'd store the alpha for the state numbered
+        // 'num_hmm_states'. We multiply this into all the
+        // transition-probabilities from the previous frame to this frame, in
+        // both the forward and backward passes, in order to keep the alphas in
+        // a good numeric range.  This won't affect the posteriors, but when
+        // computing the total likelihood we'll need to compensate for it later
+        // on.
         BaseFloat arbitrary_scale =
-            1.0 / prev_alpha[special_hmm_state * num_sequences + s];
+            1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s];
         KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0);
         this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale;
       }
@@ -129,37 +140,89 @@ void DenominatorComputation::AlphaGeneralFrame(int32 t) {
   }
 }
 
+void DenominatorComputation::AlphaDash(int32 t) {
+  BaseFloat *this_alpha = alpha_.RowData(t);
+
+  // create a 'fake matrix' for the regular alphas- view this row as a matrix.
+  // initializer takes [pointer, num-rows, num-cols, stride].
+  CuSubMatrix<BaseFloat> alpha_mat(this_alpha,
+                                   den_graph_.NumStates(),
+                                   num_sequences_,
+                                   num_sequences_);
+
+  // the alpha-sum (tot-alpha) is the sum of alpha over all states.
+  CuSubVector<BaseFloat> alpha_sum_vec(this_alpha +
+                                       den_graph_.NumStates() * num_sequences_,
+                                       num_sequences_);
+  alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0);
+
+  alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient,
+                      den_graph_.InitialProbs(),
+                      alpha_sum_vec);
+  // it's now alpha-dash.
+}
+
+// compute beta from beta-dash.
+void DenominatorComputation::Beta(int32 t) {
+  BaseFloat *this_beta_dash = beta_.RowData(t % 2);
+  // create a 'fake matrix' for the regular beta-dash (which is
+  // the counterpart of alpha-dash)- view this row as a matrix.
+  // initializer takes [pointer, num-rows, num-cols, stride].
+  CuSubMatrix<BaseFloat> beta_dash_mat(this_beta_dash,
+                                       den_graph_.NumStates(),
+                                       num_sequences_,
+                                       num_sequences_);
+  // making the t index implicit, the beta-dash-sum for each sequence is the sum
+  // over all states i of beta-dash_i * opts_.leaky_hmm_coefficient * initial_prob_i.
+  CuSubVector<BaseFloat> beta_dash_sum_vec(
+      this_beta_dash + den_graph_.NumStates() * num_sequences_,
+      num_sequences_);
+  beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat,
+                              kTrans, den_graph_.InitialProbs(), 0.0);
+  // we are computing beta in place.  After the following, beta-dash-mat
+  // will contain the actual beta (i.e. the counterpart of alpha),
+  // not the beta-dash.
+  beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec);
+}
+
 BaseFloat DenominatorComputation::Forward() {
   AlphaFirstFrame();
-  for (int32 t = 1; t <= frames_per_sequence_; t++)
+  AlphaDash(0);
+  for (int32 t = 1; t <= frames_per_sequence_; t++) {
     AlphaGeneralFrame(t);
+    AlphaDash(t);
+  }
   return ComputeTotLogLike();
 }
 
 BaseFloat DenominatorComputation::ComputeTotLogLike() {
   tot_prob_.Resize(num_sequences_);
-  // View the last alpha as a matrix of size num-hmm-states by num-sequences.
-  CuSubMatrix<BaseFloat> last_alpha(alpha_.RowData(frames_per_sequence_),
-                                    den_graph_.NumStates(),
-                                    num_sequences_,
-                                    num_sequences_);
+  // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences.
+  CuSubMatrix<BaseFloat> last_alpha_dash(
+      alpha_.RowData(frames_per_sequence_),
+      den_graph_.NumStates(),
+      num_sequences_,
+      num_sequences_);
 
-  tot_prob_.AddRowSumMat(1.0, last_alpha, 0.0);
+  tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0);
   // we should probably add an ApplyLog() function that takes a vector argument.
   tot_log_prob_ = tot_prob_;
   tot_log_prob_.ApplyLog();
   BaseFloat tot_log_prob = tot_log_prob_.Sum();
 
-  // We now have to add something for the arbitrary scaling factor.  the
-  // inverses of all the alphas for hmm-states numbered zero, for t = 0
-  // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in the
-  // transition-probs, so we need to multiply them all together (not inversed)
-  // and add them as a correction term to the total log-likes.  Note: the
+  // We now have to add something for the arbitrary scaling factor.  [note: the
   // purpose of the arbitrary scaling factors was to keep things in a good
-  // floating-point range.
+  // floating-point range]
+  // The inverses of all the tot-alpha quantities, for t = 0
+  // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in
+  // the transition-probs, so we need to multiply them all together (not
+  // inversed) and add them as a correction term to the total log-likes.
+  // These tot-alpha quantities were stored in the same place that we would
+  // have stored the HMM-state numbered 'num_hmm_states'.
+  int32 num_hmm_states = den_graph_.NumStates();
   CuSubMatrix<BaseFloat> inv_arbitrary_scales(
       alpha_, 0, frames_per_sequence_,
-      num_sequences_ * den_graph_.SpecialHmmState(), num_sequences_);
+      num_sequences_ * num_hmm_states, num_sequences_);
   CuMatrix<BaseFloat> log_inv_arbitrary_scales(
       inv_arbitrary_scales);
   log_inv_arbitrary_scales.ApplyLog();
@@ -170,12 +233,16 @@ BaseFloat DenominatorComputation::ComputeTotLogLike() {
 
 
 
-void DenominatorComputation::Backward(
+bool DenominatorComputation::Backward(
     BaseFloat deriv_weight,
     CuMatrixBase<BaseFloat> *nnet_output_deriv) {
-  BetaLastFrame();
+  BetaDashLastFrame();
+  Beta(frames_per_sequence_);
   for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) {
-    BetaGeneralFrame(t);
+    BetaDashGeneralFrame(t);
+    if (GetVerboseLevel() >= 1 || t == 0)
+      BetaGeneralFrameDebug(t);
+    Beta(t);
     if (t % kMaxDerivTimeSteps == 0) {
       // commit the derivative stored in exp_nnet_output_transposed_ by adding
       // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'.
@@ -190,35 +257,35 @@ void DenominatorComputation::Backward(
           *nnet_output_deriv,
           t * num_sequences_, chunk_frames * num_sequences_,
           0, num_pdfs);
-      output_deriv_part.AddMat(deriv_weight, transposed_deriv_part,
-                               kTrans);
+      output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans);
       if (t != 0)
         transposed_deriv_part.SetZero();
     }
   }
+  return ok_;
 }
 
-void DenominatorComputation::BetaLastFrame() {
-  // sets up the beta on the last frame (frame == frames_per_sequence_).  Note that
-  // the betas we use here contain a 1/(tot-prob) factor in order to simplify
-  // the backprop.
+void DenominatorComputation::BetaDashLastFrame() {
+  // sets up the beta-dash quantity on the last frame (frame ==
+  // frames_per_sequence_).  Note that the betas we use here contain a
+  // 1/(tot-prob) factor in order to simplify the backprop.
 
   int32 t = frames_per_sequence_;
-  BaseFloat *last_frame_beta = beta_.RowData(t % 2);
+  BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2);
 
   // create a 'fake matrix' - view this row as a matrix.
-  CuSubMatrix<BaseFloat> beta_mat(last_frame_beta,
-                                  den_graph_.NumStates(),
-                                  num_sequences_,
-                                  num_sequences_);
+  CuSubMatrix<BaseFloat> beta_dash_mat(last_frame_beta_dash,
+                                       den_graph_.NumStates(),
+                                       num_sequences_,
+                                       num_sequences_);
   CuVector<BaseFloat> inv_tot_prob(tot_prob_);
   inv_tot_prob.InvertElements();
   // the beta values at the end of the file only vary with the sequence-index,
   // not with the HMM-index.  We treat all states as having a final-prob of one.
-  beta_mat.CopyRowsFromVec(inv_tot_prob);
+  beta_dash_mat.CopyRowsFromVec(inv_tot_prob);
 }
 
-void DenominatorComputation::BetaGeneralFrame(int32 t) {
+void DenominatorComputation::BetaDashGeneralFrame(int32 t) {
   KALDI_ASSERT(t >= 0 && t < frames_per_sequence_);
   int32 num_pdfs = exp_nnet_output_transposed_.NumRows();
   // t_wrapped gives us the time-index we use when indexing
@@ -226,9 +293,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) {
   // matrix, storing only chunks of frames at a time, and we add it to the
   // non-transposed output whenever we finish a chunk.
   int32 t_wrapped = t % static_cast<int32>(kMaxDerivTimeSteps);
-  const BaseFloat *this_alpha = alpha_.RowData(t),
+  const BaseFloat *this_alpha_dash = alpha_.RowData(t),
       *next_beta = beta_.RowData((t + 1) % 2);
-  BaseFloat *this_beta = beta_.RowData(t % 2);
+  BaseFloat *this_beta_dash = beta_.RowData(t % 2);
   const Int32Pair *forward_transitions = den_graph_.ForwardTransitions();
   const DenominatorGraphTransition *transitions = den_graph_.Transitions();
   // 'probs' is the matrix of pseudo-likelihoods for frame t.
@@ -238,8 +305,7 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) {
                      t_wrapped * num_sequences_, num_sequences_);
 
   int32 num_hmm_states = den_graph_.NumStates(),
-      num_sequences = num_sequences_,
-      special_hmm_state = den_graph_.SpecialHmmState();
+      num_sequences = num_sequences_;
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
@@ -247,10 +313,9 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) {
     dim3 dimBlock(std::min<int32>(CU1DBLOCK, num_sequences), 1, 1);
     dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1);
     cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions,
-                            num_sequences, special_hmm_state,
-                            probs.Data(), probs.Stride(), this_alpha, next_beta,
-                            this_beta, log_prob_deriv.Data(),
-                            log_prob_deriv.Stride());
+                            num_sequences, probs.Data(), probs.Stride(),
+                            this_alpha_dash, next_beta, this_beta_dash,
+                            log_prob_deriv.Data(), log_prob_deriv.Stride());
     CU_SAFE_CALL(cudaGetLastError());
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
@@ -262,12 +327,12 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) {
     BaseFloat *log_prob_deriv_data = log_prob_deriv.Data();
     for (int32 h = 0; h < num_hmm_states; h++) {
       for (int32 s = 0; s < num_sequences; s++) {
-        BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s],
+        BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s],
             inv_arbitrary_scale =
-            this_alpha[special_hmm_state * num_sequences + s];
+            this_alpha_dash[num_hmm_states * num_sequences + s];
         double tot_variable_factor = 0.0;
-        BaseFloat
-            occupation_factor = this_alpha_prob / inv_arbitrary_scale;
+        BaseFloat occupation_factor = this_alpha_dash_prob /
+            inv_arbitrary_scale;
         const DenominatorGraphTransition
             *trans_iter = transitions + forward_transitions[h].first,
             *trans_end = transitions + forward_transitions[h].second;
@@ -282,13 +347,49 @@ void DenominatorComputation::BetaGeneralFrame(int32 t) {
           BaseFloat occupation_prob = variable_factor * occupation_factor;
           log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob;
         }
-        this_beta[h * num_sequences + s] =
+        this_beta_dash[h * num_sequences + s] =
             tot_variable_factor / inv_arbitrary_scale;
       }
     }
   }
 }
 
+void DenominatorComputation::BetaGeneralFrameDebug(int32 t) {
+  BaseFloat num_hmm_states = den_graph_.NumStates(),
+      alpha_beta_size = num_hmm_states * num_sequences_;
+  CuSubVector<BaseFloat> this_alpha_dash(alpha_.RowData(t), alpha_beta_size),
+      this_beta_dash(beta_.RowData(t % 2), alpha_beta_size);
+  int32 t_wrapped = t % static_cast<int32>(kMaxDerivTimeSteps),
+      num_pdfs = exp_nnet_output_transposed_.NumRows();
+  CuSubMatrix<BaseFloat> this_log_prob_deriv(
+      nnet_output_deriv_transposed_, 0, num_pdfs,
+      t_wrapped * num_sequences_, num_sequences_);
+  BaseFloat alpha_beta_product = VecVec(this_alpha_dash,
+                                        this_beta_dash),
+      this_log_prob_deriv_sum = this_log_prob_deriv.Sum();
+  if (!ApproxEqual(alpha_beta_product, num_sequences_)) {
+    KALDI_WARN << "On time " << t << ", alpha-beta product "
+               << alpha_beta_product << " != " << num_sequences_
+               << " alpha-dash-sum = " << this_alpha_dash.Sum()
+               << ", beta-dash-sum = " << this_beta_dash.Sum();
+    if (fabs(alpha_beta_product - num_sequences_) > 2.0) {
+      KALDI_WARN << "Excessive error detected, will abandon this minibatch";
+      ok_ = false;
+    }
+  }
+  // use higher tolerance, since we are using randomized pruning for the
+  // log-prob derivatives.
+  if (!ApproxEqual(this_log_prob_deriv_sum,
+                   num_sequences_, 0.01)) {
+    KALDI_WARN << "On time " << t << ", log-prob-deriv sum "
+               << this_log_prob_deriv_sum << " != " << num_sequences_;
+    if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) {
+      KALDI_WARN << "Excessive error detected, will abandon this minibatch";
+      ok_ = false;
+    }
+  }
+}
+
 
 }  // namespace chain
 }  // namespace kaldi
diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h
index f3b0afa6721..b0f616673d6 100644
--- a/src/chain/chain-denominator.h
+++ b/src/chain/chain-denominator.h
@@ -41,6 +41,153 @@ namespace kaldi {
 namespace chain {
 
 
+/*
+  This extended comment describes how we implement forward-backward without log
+  and without overflow, and also the leaky-HMM idea.
+
+  We'll start by establishing the notation for conventional forward-backward,
+  then add the 'arbitrary-scale' concept that prevents overflow, and then
+  add the 'leaky-hmm' concept.
+
+  All this is done in parallel over multiple sequences, but the computations
+  are independent over the separate sequences, so we won't introduce any notation
+  or index for the sequence; we'll just explain it for one sequence.
+
+  Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for
+  hmm-state indexes).  Let foll(i) give a list of arcs leaving state i, and
+  pred(i) give a list of arcs entering state i, and we'll use notation like:
+    for (j, p, n) in foll(i):
+  for iterating over those arcs, where in this case j is the destination-state,
+  p is the transition-probability of the arc and n is the pdf-id index.
+  We can then look up the emission probability as x(t, n) for some frame
+  0 <= t < T.
+
+  ** Version 1 of the computation (naive version) **
+
+  * Forward computation (version 1)
+
+  In the forward computation we're computing alpha(t, i) for 0 <= t <= T:
+    - For the first frame, set alpha(0, i) = init(i), where init(i) is the
+      initial-probability of state i.  # [in our framework these are obtained
+      # by running the HMM for a while and getting an averaged occupation
+      # probability, and using this as an initial-prob, since the boundaries of
+      # chunks don't really correspond to utterance boundaries in general.]
+    - For t = 1 ... T:
+        for i = 0 ... I-1:
+           alpha(t, i) = 0
+           for (j, p, n) in pred(i):  # note: j is preceding-state.
+              alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p.
+
+    - total-prob = \sum_i alpha(T, i).  # note, we take the final-probs of all states
+                                        # to be 1.0.
+
+  * Backward computation (version 1)
+
+  And now for the backward computation.  Contrary to tradition, we include the
+  inverse of the total-prob as a factor in the betas.  This is both more
+  convenient (it simplifies the way we obtain posteriors), and makes the
+  algorithm more generalizable as all the beta quantities can be interpreted as
+  the partial derivative of the logprob with respect to their corresponding
+  alpha.
+
+  In forward backward notation, gamma is normally used for state-level
+  occupation probabilities, but what we care about here is pdf-id-level
+  occupation probabilities (i.e. the partial derivative of the log-likelihood
+  w.r.t. the logs of the x(t, n) quantities), so we use gamma for that.
+
+    - for the final frame:
+       for each i, beta(T, i) = 1 / total-prob.
+    - for t = T-1 ... 0:
+        for i = 0 ... I-1:
+           beta(t, i) = 0
+           for (j, p, n) in foll(i):  # note: j is following-state.
+              beta(t, i) += x(t, n) * beta(t+1, j) * p.
+              gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p.
+
+  ** Version 2 of the computation (renormalized version) **
+
+  Version 1 of the algorithm is susceptible to numeric underflow and overflow,
+  due to the limited range of IEEE floating-point exponents.
+  Define tot-alpha(t) = \sum_i alpha(t, i).  Then the renormalized version of
+  the computation is as above, except whenever the quantity x(t, n) appears,
+  we replace it with x(t, n) / tot-alpha(t).  In the algorithm we refer to
+  1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any
+  value here as long as we are consistent and the value only varies with t
+  and not with n; we'll always get the same posteriors (gamma).
+
+  When the algorithm outputs log(total-prob) as the total log-probability
+  of the HMM, we have to instead return the expression:
+    log(total-prob) + \sum_{t=0}^{T-1} log(tot-alpha(t)).
+  to correct for the scaling of the x values.
+
+  The algorithm is still vulnerable to overflow in the beta computation because
+  it's possible that the dominant path could have a very tiny alpha.  However,
+  once we introduce the leaky-HMM idea (below), this problem will disappear.
+
+  ** Version 3 of the computation (leaky-HMM version) **
+
+  The leaky-HMM idea is intended to improve generalization by allowing paths
+  other than those explicitly allowed by the FST we compiled.  Another way to
+  look at it is as a way of hedging our bets about where we split the utterance,
+  so it's as if we're marginalizing over different splits of the utterance.  You
+  could also think of it as a modification of the FST so that there is an
+  epsilon transition from each state to a newly added state, with probability
+  one, and then an epsilon transition from the newly added state to each state
+  with probability leaky-hmm-prob * init(i) [except we need a mechanism so that
+  no more than two epsilon transitions can be taken per frame- this would involve
+  creating two copies of the states]
+
+  Recall that we mentioned that init(i) is the initial-probability of
+  HMM-state i, but these are obtained in such a way that they can be treated
+  as priors, or average occupation-probabilities.
+
+  Anyway, the way we formulate leaky-hmm is as follows:
+
+  * Forward computation (version 3)
+
+  Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical
+  value.  It defines how much probability we give to the 'leaky' transitions.
+
+  - For frame 0, set alpha(0, i) = init(i).
+  - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i).
+  - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i).
+
+  - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use
+      the previous frame's alpha' instead of alpha.  That is:
+           alpha(t, i) = 0
+           for (j, p, n) in pred(i):  # note: j is preceding-state.
+              alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1)
+
+  - total-prob = \sum_i alpha'(T, i)
+
+  The corrected log-prob that we return from the algorithm will be
+   (log(total-prob) + \sum_{t=0}^{T-1} log(tot-alpha(t))).
+
+  * Backward computation (version 3)
+
+  The backward computation is as follows.  It is fairly straightforward to
+  derive if you think of it as an instance of backprop where beta, tot-beta and
+  beta' are the partial derivatives of the output log-prob w.r.t. the
+  corresponding alpha, tot-alpha and alpha' quantities.  Note, tot-beta is not
+  really the sum of the betas as its name might suggest, it's just the
+  derivative w.r.t. tot-alpha.
+
+   - beta'(T, i) = 1 / total-prob.
+   - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i)
+   - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t).
+   - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows:
+        for 0 <= i < I:
+           beta'(t, i) = 0
+           for (j, p, n) in foll(i):  # note: j is following-state.
+              beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t)
+              gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p *  x(t, n) / tot-alpha(t)
+
+   Note: in the code, the tot-alpha and tot-beta quantities go in the same
+   memory location that the corresponding alpha and beta for state I would go.
+
+ */
+
+
 // This does forward-backward in parallel on a number of sequences, using a
 // single HMM.
 class DenominatorComputation {
@@ -70,7 +217,8 @@ class DenominatorComputation {
 
   // this adds deriv_weight times (the derivative of the log-prob w.r.t. the
   // nnet output), to 'nnet_output_deriv'.
-  void Backward(BaseFloat deriv_weight,
+  // returns true if everything seemed OK, false if a failure was detected.
+  bool Backward(BaseFloat deriv_weight,
                 CuMatrixBase<BaseFloat> *nnet_output_deriv);
 
  private:
@@ -84,6 +232,9 @@ class DenominatorComputation {
   void AlphaFirstFrame();
   // the alpha computation for some 0 < t <= num_time_steps_.
   void AlphaGeneralFrame(int32 t);
+  // does the 'alpha-dash' computation for time t.  this relates to
+  // 'leaky hmm'.
+  void AlphaDash(int32 t);
 
   // done after all the alphas, this function computes and returns the total
   // log-likelihood summed over all the sequences, and sets tot_prob_ (if we're
@@ -92,9 +243,15 @@ class DenominatorComputation {
   // from the Forward() computation).
   BaseFloat ComputeTotLogLike();
 
-  void BetaLastFrame();
+  void BetaDashLastFrame();
   // beta computation for 0 <= beta < num_time_steps_.
-  void BetaGeneralFrame(int32 t);
+  void BetaDashGeneralFrame(int32 t);
+  // compute the beta quantity from the beta-dash quantity (relates to leaky hmm).
+  void Beta(int32 t);
+
+  // some checking that we can do if debug mode is activated, or on frame zero.
+  // Sets ok_ to false if a bad problem is detected.
+  void BetaGeneralFrameDebug(int32 t);
 
   const ChainTrainingOptions &opts_;
   const DenominatorGraph &den_graph_;
@@ -116,13 +273,18 @@ class DenominatorComputation {
   // the derivs w.r.t. the nnet outputs (transposed)
   CuMatrix<BaseFloat> nnet_output_deriv_transposed_;
 
-  // the alpha probabilities; dimension is (frames_per_sequence + 1) by (num-hmm-states
-  // * num-sequences).  Note, they are not logs.
+  // the (temporarily) alpha and (more permanently) alpha-dash probabilities;
+  // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences +
+  // num_sequences).  Note, they are not logs.  The last 'num_sequences'
+  // columns, where the alpha for the state indexed 'num_hmm_states' would live,
+  // are for the alpha-sums, which relates to leaky HMM.
   CuMatrix<BaseFloat> alpha_;
 
-  // the beta probabilities (rolling buffer); dimension is 2 * (num-hmm-states *
-  // num-sequences).  Note: for efficiency and to simplify the equations, these
-  // are actually the beta / tot_prob_.
+  // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 *
+  // (num-hmm-states * num-sequences + num_sequences).  [the last
+  // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.]
+  // Note: for efficiency and to simplify the equations, these are actually the
+  // beta / tot_prob_.
   CuMatrix<BaseFloat> beta_;
 
   // the total probability for each sequence, excluding the product of
@@ -136,11 +298,13 @@ class DenominatorComputation {
   CuVector<BaseFloat> tot_log_prob_;
 
   // the log of the total correction term for each sequence, which is the
-  // product of the alpha_[special hmm state] over all the frames.  The
-  // 'correction terms' are terms that we divide the alphas and betas by in
-  // order to keep them in a good dynamic range.  The product of them
-  // must be included in the total likelihood.
+  // product of the alpha-sums [used in the leaky-hmm computation] over all the
+  // frames.  The 'correction terms' are terms that we divide the alphas and
+  // betas by in order to keep them in a good dynamic range.  The product of
+  // them must be included in the total likelihood.
   CuVector<BaseFloat> log_correction_term_;
+
+  bool ok_;
 };
 
 
diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h
index af7a1a6b176..8ec1dcf322c 100644
--- a/src/chain/chain-kernels-ansi.h
+++ b/src/chain/chain-kernels-ansi.h
@@ -29,7 +29,6 @@ extern "C" {
                                const Int32Pair *forward_transitions,
                                const DenominatorGraphTransition *transitions,
                                int32_cuda num_sequences,
-                               int32_cuda special_hmm_state,
                                const BaseFloat *probs,
                                int32_cuda prob_stride,
                                const BaseFloat *this_alpha,
@@ -42,7 +41,6 @@ extern "C" {
                               const Int32Pair *backward_transitions,
                               const DenominatorGraphTransition *transitions,
                               int32_cuda num_sequences,
-                              int32_cuda special_hmm_state,
                               const BaseFloat *probs,
                               int32_cuda prob_stride,
                               const BaseFloat *prev_alpha,
diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu
index 8fcf8037d36..ea10b6680f0 100644
--- a/src/chain/chain-kernels.cu
+++ b/src/chain/chain-kernels.cu
@@ -40,9 +40,9 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) {
   // threshold itself with probability (value / threshold).  This preserves
   // expectations.  Note: we assume that value >= 0.
 
-  // you can choose any value for the threshold, but powers of 2 are nice
-  // because they will exactly preserve the precision of the value.
-  const Real threshold = 1.0 / (1 << 14);
+  // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines
+  // the threshold for randomized posterior pruning.
+  const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo);
   if (value >= threshold) {
     atomic_add(address, value);
   } else {
@@ -67,7 +67,6 @@ __device__ inline void atomic_add_thresholded(Real* address, Real value) {
     if ((x >> 12) > (x & 4095))
       atomic_add(address, threshold);
   }
-
 }
 
 // one iteration of the forward computation in the 'tombstone' CTC HMM computation.
@@ -82,7 +81,6 @@ __global__
 static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions,
                                     const DenominatorGraphTransition *transitions,
                                     int32_cuda num_sequences,
-                                    int32_cuda special_hmm_state,
                                     const BaseFloat *probs,
                                     int32_cuda prob_stride,
                                     const BaseFloat *prev_alpha,
@@ -137,15 +135,18 @@ static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions,
     this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0;
   }
 
-  // Let arbitrary_scale be the inverse of the alpha value for the
-  // hmm-state indexed special_hmm_state_ on the previous frame (for this
-  // sequence); we multiply this into all the transition-probabilities
-  // from the previous frame to this frame, in both the forward and
-  // backward passes, in order to keep the alphas in a good numeric range.
-  // This won't affect the posteriors, but when computing the total
-  // likelihood we'll need to compensate for it later on.
+  int32_cuda num_hmm_states = gridDim.y;
+  // Let arbitrary_scale be the inverse of the sum of all alpha values on the
+  // previous frame; this sum of all the alpha values is stored in the place
+  // that we'd store the previous alpha for state-index equal to num_hmm_states
+  // (i.e. one past the end).  We multiply this into all the
+  // transition-probabilities from the previous frame to this frame, in both the
+  // forward and backward passes, in order to keep the alphas in a good numeric
+  // range.  This won't affect the posteriors, as it's just a constant factor
+  // for each frame, but when computing the total likelihood we'll need to
+  // compensate for it later on.
   BaseFloat arbitrary_scale =
-      1.0 / prev_alpha[special_hmm_state * num_sequences + s];
+      1.0 / prev_alpha[num_hmm_states * num_sequences + s];
   this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale;
 }
 
@@ -154,7 +155,6 @@ __global__
 static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions,
                                      const DenominatorGraphTransition *transitions,
                                      int32_cuda num_sequences,
-                                     int32_cuda special_hmm_state,
                                      const BaseFloat *probs, int32_cuda prob_stride,
                                      const BaseFloat *this_alpha, const BaseFloat *next_beta,
                                      BaseFloat *this_beta, BaseFloat *log_prob_deriv,
@@ -179,10 +179,14 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions,
   if (s >= num_sequences)
     return;
 
+  // below, you can read 'gridDim.y' as 'num_hmm_states'.  See where
+  // arbitrary_scale is defined in the forward computation above, for more
+  // explanation.
   BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s],
       inv_arbitrary_scale =
-      this_alpha[special_hmm_state * num_sequences + s];
+      this_alpha[gridDim.y * num_sequences + s];
   double tot_variable_factor = 0.0;
+
   BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale;
   const DenominatorGraphTransition
       *trans_iter = transitions + forward_transitions[h].first,
@@ -223,7 +227,8 @@ static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions,
     atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s),
                            occupation_prob0);
   }
-  this_beta[h * num_sequences + s] = tot_variable_factor / inv_arbitrary_scale;
+  BaseFloat beta = tot_variable_factor / inv_arbitrary_scale;
+  this_beta[h * num_sequences + s] = beta;
 }
 
 
@@ -231,28 +236,26 @@ void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl,
                             const Int32Pair *backward_transitions,
                             const DenominatorGraphTransition *transitions,
                             int32_cuda num_sequences,
-                            int32_cuda special_hmm_state,
                             const BaseFloat *probs, int32_cuda prob_stride,
                             const BaseFloat *prev_alpha,
                             BaseFloat *this_alpha) {
   _cuda_chain_hmm_forward<<<Gr,Bl>>>(backward_transitions, transitions,
-                                     num_sequences, special_hmm_state,
-                                     probs, prob_stride, prev_alpha, this_alpha);
+                                     num_sequences, probs, prob_stride,
+                                     prev_alpha, this_alpha);
 }
 
 void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl,
                              const Int32Pair *forward_transitions,
                              const DenominatorGraphTransition *transitions,
                              int32_cuda num_sequences,
-                             int32_cuda special_hmm_state,
                              const BaseFloat *probs, int32_cuda prob_stride,
                              const BaseFloat *this_alpha, const BaseFloat *next_beta,
                              BaseFloat *this_beta,
                              BaseFloat *log_prob_deriv,
                              int32_cuda log_prob_deriv_stride) {
   _cuda_chain_hmm_backward<<<Gr,Bl>>>(forward_transitions, transitions,
-                                      num_sequences, special_hmm_state,
-                                      probs, prob_stride, this_alpha, next_beta,
+                                      num_sequences, probs, prob_stride,
+                                      this_alpha, next_beta,
                                       this_beta, log_prob_deriv,
                                       log_prob_deriv_stride);
 }
diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h
index 1dc9d9d489d..15cb31e0571 100644
--- a/src/chain/chain-numerator.h
+++ b/src/chain/chain-numerator.h
@@ -76,8 +76,8 @@ class NumeratorComputation {
   BaseFloat Forward();
 
   // Does the backward computation and (efficiently) adds the derivative of the
-  // nnet output w.r.t. the (log-prob times supervision_.weight) to
-  // 'nnet_output_deriv'.
+  // (log-prob times supervision_.weight times deriv_weight) w.r.t. the
+  // nnet output to 'nnet_output_deriv'.
   void Backward(CuMatrixBase<BaseFloat> *nnet_output_deriv);
 
  private:
diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc
index e6a333317e8..ea673df3291 100644
--- a/src/chain/chain-supervision-test.cc
+++ b/src/chain/chain-supervision-test.cc
@@ -251,15 +251,17 @@ void ChainTrainingTest(const DenominatorGraph &den_graph,
     nnet_output.SetRandn();
 
   ChainTrainingOptions opts;
+  if (RandInt(0, 1) == 1)
+    opts.leaky_hmm_coefficient = 0.2;
 
   CuMatrix<BaseFloat> nnet_output_deriv(nnet_output.NumRows(),
                                         nnet_output.NumCols(),
                                         kUndefined);
 
-  BaseFloat objf, weight;
+  BaseFloat objf, l2_term, weight;
 
   ComputeChainObjfAndDeriv(opts, den_graph, supervision,
-                           nnet_output, &objf, &weight,
+                           nnet_output, &objf, &l2_term, &weight,
                            &nnet_output_deriv);
 
   {
@@ -296,11 +298,12 @@ void ChainTrainingTest(const DenominatorGraph &den_graph,
     CuMatrix<BaseFloat> nnet_output_perturbed(nnet_delta_output);
     nnet_output_perturbed.AddMat(1.0, nnet_output);
 
-    BaseFloat objf_modified, weight_modified;
+    BaseFloat objf_modified, l2_term_modified, weight_modified;
 
     ComputeChainObjfAndDeriv(opts, den_graph, supervision,
                              nnet_output_perturbed,
-                             &objf_modified, &weight_modified,
+                             &objf_modified, &l2_term_modified,
+                             &weight_modified,
                              NULL);
 
     observed_objf_changes(p) = objf_modified - objf;
@@ -419,21 +422,6 @@ void ChainDenominatorTest(const DenominatorGraph &den_graph) {
                  10.0);
   }
 
-  { // another check: that scaling the initial probs has the expected effect.
-    BaseFloat scale = 0.1 + 0.7 * RandUniform();
-    DenominatorGraph den_graph_scaled(den_graph);
-    den_graph_scaled.ScaleInitialProbs(scale);
-    DenominatorComputation denominator_computation_scaled_initial(
-        opts, den_graph_scaled,
-        num_sequences, nnet_output);
-    BaseFloat forward_prob_scaled_initial =
-        denominator_computation_scaled_initial.Forward();
-    BaseFloat observed_difference =
-        forward_prob_scaled_initial - forward_prob,
-        predicted_difference = num_sequences * log(scale);
-    AssertEqual(observed_difference, predicted_difference);
-  }
-
   int32 num_tries = 5;
   BaseFloat epsilon = 1.0e-04;
   Vector<BaseFloat> predicted_objf_changes(num_tries),
diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index 03fdb3cbe64..a1972736c68 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -679,60 +679,6 @@ bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst,
   return true;
 }
 
-void SplitIntoRanges(int32 num_frames,
-                     int32 frames_per_range,
-                     std::vector<int32> *range_starts) {
-  if (frames_per_range > num_frames) {
-    range_starts->clear();
-    return;  // there is no room for even one range.
-  }
-  int32 num_ranges = num_frames  / frames_per_range,
-      extra_frames = num_frames % frames_per_range;
-  // this is a kind of heuristic.  If the number of frames we'd
-  // be skipping is less than 1/4 of the frames_per_range, then
-  // skip frames; otherwise, duplicate frames.
-  // it's important that this is <=, not <, so that if
-  // extra_frames == 0 and frames_per_range is < 4, we
-  // don't insert an extra range.
-  if (extra_frames <= frames_per_range / 4) {
-    // skip frames.  we do this at start or end, or between ranges.
-    std::vector<int32> num_skips(num_ranges + 1, 0);
-    for (int32 i = 0; i < extra_frames; i++)
-      num_skips[RandInt(0, num_ranges)]++;
-    range_starts->resize(num_ranges);
-    int32 cur_start = num_skips[0];
-    for (int32 i = 0; i < num_ranges; i++) {
-      (*range_starts)[i] = cur_start;
-      cur_start += frames_per_range;
-      cur_start += num_skips[i + 1];
-    }
-    KALDI_ASSERT(cur_start == num_frames);
-  } else {
-    // duplicate frames.
-    num_ranges++;
-    int32 num_duplicated_frames = frames_per_range - extra_frames;
-    // the way we handle the 'extra_frames' frames of output is that we
-    // backtrack zero or more frames between outputting each pair of ranges, and
-    // the total of these backtracks equals 'extra_frames'.
-    std::vector<int32> num_backtracks(num_ranges, 0);
-    for (int32 i = 0; i < num_duplicated_frames; i++) {
-      // num_ranges - 2 below is not a bug.  we only want to backtrack
-      // between ranges, not past the end of the last range (i.e. at
-      // position num_ranges - 1).  we make the vector one longer to
-      // simplify the loop below.
-      num_backtracks[RandInt(0, num_ranges - 2)]++;
-    }
-    range_starts->resize(num_ranges);
-    int32 cur_start = 0;
-    for (int32 i = 0; i < num_ranges; i++) {
-      (*range_starts)[i] = cur_start;
-      cur_start += frames_per_range;
-      cur_start -= num_backtracks[i];
-    }
-    KALDI_ASSERT(cur_start == num_frames);
-  }
-}
-
 bool Supervision::operator == (const Supervision &other) const {
   return weight == other.weight && num_sequences == other.num_sequences &&
       frames_per_sequence == other.frames_per_sequence &&
@@ -755,50 +701,6 @@ void Supervision::Check(const TransitionModel &trans_mdl) const {
     KALDI_ERR << "Num-frames does not match fst.";
 }
 
-void GetWeightsForRanges(int32 range_length,
-                         const std::vector<int32> &range_starts,
-                         std::vector<Vector<BaseFloat> > *weights) {
-  KALDI_ASSERT(range_length > 0);
-  int32 num_ranges = range_starts.size();
-  weights->resize(num_ranges);
-  for (int32 i = 0; i < num_ranges; i++) {
-    (*weights)[i].Resize(range_length);
-    (*weights)[i].Set(1.0);
-  }
-  for (int32 i = 0; i + 1 < num_ranges; i++) {
-    int32 j = i + 1;
-    int32 i_start = range_starts[i], i_end = i_start + range_length,
-          j_start = range_starts[j];
-    KALDI_ASSERT(j_start > i_start);
-    if (i_end > j_start) {
-      Vector<BaseFloat> &i_weights = (*weights)[i], &j_weights = (*weights)[j];
-
-      int32 overlap_length = i_end - j_start;
-      // divide the overlapping piece of the 2 ranges into 3 regions of
-      // approximately equal size, called the left, middle and right region.
-      int32 left_length = overlap_length / 3,
-          middle_length = (overlap_length - left_length) / 2,
-           right_length = overlap_length - left_length - middle_length;
-      KALDI_ASSERT(left_length >= 0 && middle_length >= 0 && right_length >= 0 &&
-                   left_length + middle_length + right_length == overlap_length);
-      // set the weight of the left region to be zero for the right (j) range.
-      for (int32 k = 0; k < left_length; k++)
-        j_weights(k) = 0.0;
-      // set the weight of the right region to be zero for the left (i) range.
-      for (int32 k = 0; k < right_length; k++)
-        i_weights(range_length - 1 - k) = 0.0;
-      // for the middle range, linearly interpolate between the 0's and 1's.
-      // note: we multiply with existing weights instead of set in order to get
-      // more accurate behavior in the unexpected case where things triply
-      // overlap.
-      for (int32 k = 0; k < middle_length; k++) {
-        BaseFloat weight = (0.5 + k) / middle_length;
-        j_weights(left_length + k) = weight;
-        i_weights(range_length - 1 - right_length - k) = weight;
-      }
-    }
-  }
-}
 
 }  // namespace chain
 }  // namespace kaldi
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index b17f62d00ad..0ca12e628e1 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -364,42 +364,27 @@ void AppendSupervision(const std::vector<const Supervision*> &input,
                        std::vector<Supervision> *output_supervision);
 
 
-/// This function helps you to pseudo-randomly split a sequence of length 'num_frames',
-/// interpreted as frames 0 ... num_frames - 1, into pieces of length exactly
-/// 'frames_per_range', to be used as examples for training.  Because frames_per_range
-/// may not exactly divide 'num_frames', this function will leave either small gaps or
-/// small overlaps in pseudo-random places.
-/// The output 'range_starts' will be set to a list of the starts of ranges, the
-/// output ranges are of the form
-/// [ (*range_starts)[i] ... (*range_starts)[i] + frames_per_range - 1 ].
-void SplitIntoRanges(int32 num_frames,
-                     int32 frames_per_range,
-                     std::vector<int32> *range_starts);
-
-
-/// This utility function is not used directly in the 'chain' code.  It is used
-/// to get weights for the derivatives, so that we don't doubly train on some
-/// frames after splitting them up into overlapping ranges of frames.  The input
-/// 'range_starts' will be obtained from 'SplitIntoRanges', but the
-/// 'range_length', which is a length in frames, may be longer than the one
-/// supplied to SplitIntoRanges, due the 'overlap'.  (see the calling code...
-/// if we want overlapping ranges, we get it by 'faking' the input to
-/// SplitIntoRanges).
-///
-/// The output vector 'weights' will be given the same dimension as
-/// 'range_starts'.  By default the output weights in '*weights' will be vectors
-/// of all ones, of length equal to 'range_length', and '(*weights)[i]' represents
-/// the weights given to frames numbered
-///   t = range_starts[i] ... range_starts[i] + range_length - 1.
-/// If these ranges for two successive 'i' values overlap, then we
-/// reduce the weights to ensure that no 't' value gets a total weight
-/// greater than 1.  We do this by dividing the overlapped region
-/// into three approximately equal parts, and giving the left part
-/// to the left range; the right part to the right range; and
-/// in between, interpolating linearly.
-void GetWeightsForRanges(int32 range_length,
-                         const std::vector<int32> &range_starts,
-                         std::vector<Vector<BaseFloat> > *weights);
+
+/// This is a newer version of GetWeightsForRanges, with simpler behavior
+/// and a different purpose.  Instead of aiming to create weights that sum
+/// to one over the whole file, the purpose is to
+/// zero out the derivative weights for a certain number of frames to each
+/// side of every 'cut point' in the numerator lattice [by numerator lattice,
+/// what I mean is the FST that we automatically generate from the numerator
+/// alignment or lattice].  So we don't zero out the weights for the very
+/// beginning or very end of each original utterance, just those where
+/// we split the utterance into pieces.  We believe there is an incentive
+/// for the network to produce deletions near the edges, and this aims to fix
+/// this problem.
+/// range_length is the length of each range of times (so range_starts[0]
+/// represents the start of a range of t values of length 'range_length',
+/// and likewise for range_starts[1] etc.), and num_frames_zeroed is the
+/// number of frames on each side of the cut point on which we are supposed
+/// to zero out the derivative.
+void GetWeightsForRangesNew(int32 range_length,
+                            int32 num_frames_zeroed,
+                            const std::vector<int32> &range_starts,
+                            std::vector<Vector<BaseFloat> > *weights);
 
 
 typedef TableWriter<KaldiObjectHolder<Supervision> > SupervisionWriter;
diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc
index 42cdfed2713..1bf0201fbfa 100644
--- a/src/chain/chain-training.cc
+++ b/src/chain/chain-training.cc
@@ -29,9 +29,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
                               const DenominatorGraph &den_graph,
                               const Supervision &supervision,
                               const CuMatrixBase<BaseFloat> &nnet_output,
-                              BaseFloat *tot_objf,
-                              BaseFloat *tot_weight,
-                              CuMatrixBase<BaseFloat> *nnet_output_deriv) {
+                              BaseFloat *objf,
+                              BaseFloat *l2_term,                              
+                              BaseFloat *weight,
+                              CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                              CuMatrixBase<BaseFloat> *xent_output_deriv) {
   BaseFloat num_logprob_weighted;
   if (nnet_output_deriv)
     nnet_output_deriv->SetZero();
@@ -40,29 +42,44 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
     // note: supervision.weight is included as a factor in the derivative from
     // the numerator object, and the logprob too.
     num_logprob_weighted = numerator.Forward();
-    if (nnet_output_deriv)
+    if (nnet_output_deriv) {
       numerator.Backward(nnet_output_deriv);
+      if (xent_output_deriv)
+        xent_output_deriv->CopyFromMat(*nnet_output_deriv);
+    } else if (xent_output_deriv) {
+      // this branch is taken if xent_output_deriv is non-NULL but
+      // nnet_output_deriv is NULL, which could happen if you want to compute
+      // the cross-entropy objective but not the derivatives.
+      xent_output_deriv->SetZero();
+      numerator.Backward(xent_output_deriv);
+    }
   }
   DenominatorComputation denominator(opts, den_graph,
                                      supervision.num_sequences,
                                      nnet_output);
 
   BaseFloat den_logprob = denominator.Forward();
+  bool ok = true;
   if (nnet_output_deriv)
-    denominator.Backward(-supervision.weight,
-                         nnet_output_deriv);
+    ok = denominator.Backward(-supervision.weight,
+                              nnet_output_deriv);
 
-  *tot_objf = num_logprob_weighted - supervision.weight * den_logprob;
-  *tot_weight = supervision.weight * supervision.num_sequences *
+  *objf = num_logprob_weighted - supervision.weight * den_logprob;
+  *weight = supervision.weight * supervision.num_sequences *
       supervision.frames_per_sequence;
-  if (!(*tot_objf  == *tot_objf)) {
-    // inf or NaN detected
+  if (!((*objf) - (*objf) == 0) || !ok) {
+    // inf or NaN detected, or denominator computation returned false.
     if (nnet_output_deriv)
       nnet_output_deriv->SetZero();
+    if (xent_output_deriv)
+      xent_output_deriv->SetZero();
     BaseFloat default_objf = -10;
-    KALDI_WARN << "Objective function is " << (*tot_objf)
-               << ", setting to " << default_objf << " per frame.";
-    *tot_objf  = default_objf * *tot_weight;
+    KALDI_WARN << "Objective function is " << (*objf)
+               << " and denominator computation (if done) returned "
+               << std::boolalpha << ok
+               << ", setting objective function to " << default_objf
+               << " per frame.";
+    *objf  = default_objf * *weight;
   }
 
   // This code helps us see how big the derivatives are, on average,
@@ -81,6 +98,16 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
       row_products_per_frame(i / num_sequences) += row_products_cpu(i);
     KALDI_LOG << "Derivs per frame are " << row_products_per_frame;
   }
+
+  if (opts.l2_regularize == 0.0) {
+    *l2_term = 0.0;
+  } else {
+    // compute the l2 penalty term and its derivative
+    BaseFloat scale = supervision.weight * opts.l2_regularize;
+    *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans);
+    if (nnet_output_deriv)
+      nnet_output_deriv->AddMat(-1.0 * scale, nnet_output);
+  }
 }
 
 
diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h
index 8eb7e8343f4..e6143d10846 100644
--- a/src/chain/chain-training.h
+++ b/src/chain/chain-training.h
@@ -40,11 +40,44 @@ namespace chain {
 
 
 struct ChainTrainingOptions {
-  // Currently empty.
-
-  ChainTrainingOptions() { }
-
+  // l2 regularization constant on the 'chain' output; the actual term added to
+  // the objf will be -0.5 times this constant times the squared l2 norm.
+  // (squared so it's additive across the dimensions).  e.g. try 0.0005.
+  BaseFloat l2_regularize;
+
+  // Coefficient for 'leaky hmm'.  This means we have an epsilon-transition from
+  // each state to a special state with probability one, and then another
+  // epsilon-transition from that special state to each state, with probability
+  // leaky_hmm_coefficient times [initial-prob of destination state].  Imagine
+  // we make two copies of each state prior to doing this, version A and version
+  // B, with transition from A to B, so we don't have to consider epsilon loops-
+  // or just imagine the coefficient is small enough that we can ignore the
+  // epsilon loops.
+  BaseFloat leaky_hmm_coefficient;
+
+
+  // Cross-entropy regularization constant.  (e.g. try 0.1).  If nonzero,
+  // the network is expected to have an output named 'output-xent', which
+  // should have a softmax as its final nonlinearity.
+  BaseFloat xent_regularize;
+
+  ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05),
+                          xent_regularize(0.0) { }
+  
   void Register(OptionsItf *opts) {
+    opts->Register("l2-regularize", &l2_regularize, "l2 regularization "
+                   "constant for 'chain' training, applied to the output "
+                   "of the neural net.");
+    opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient "
+                   "that allows transitions from each HMM state to each other "
+                   "HMM state, to ensure gradual forgetting of context (can "
+                   "improve generalization).  For numerical reasons, may not be "
+                   "exactly zero.");
+    opts->Register("xent-regularize", &xent_regularize, "Cross-entropy "
+                   "regularization constant for 'chain' training.  If "
+                   "nonzero, the network is expected to have an output "
+                   "named 'output-xent', which should have a softmax as "
+                   "its final nonlinearity.");
   }
 };
 
@@ -59,10 +92,13 @@ struct ChainTrainingOptions {
                             paths and constraints on the alignment as an FST
    @param [in] nnet_output  The output of the neural net; dimension must equal
                           ((supervision.num_sequences * supervision.frames_per_sequence) by
-                            den_graph.NumPdfs()).
+                            den_graph.NumPdfs()).  The rows are ordered as: all sequences
+                            for frame 0; all sequences for frame 1; etc.
    @param [out] objf       The [num - den] objective function computed for this
                            example; you'll want to divide it by 'tot_weight' before
                            displaying it.
+   @param [out] l2_term  The l2 regularization term in the objective function, if
+                           the --l2-regularize option is used.  To be added to 'objf'.
    @param [out] weight     The weight to normalize the objective function by;
                            equals supervision.weight * supervision.num_sequences *
                            supervision.frames_per_sequence.
@@ -70,14 +106,22 @@ struct ChainTrainingOptions {
                            the neural-net output.  Only written to if non-NULL.
                            You don't have to zero this before passing to this function,
                            we zero it internally.
+   @param [out] xent_output_deriv  If non-NULL, then the numerator part of the derivative
+                           (which equals a posterior from the numerator forward-backward,
+                           scaled by the supervision weight) is written to here.  This will
+                           be used in the cross-entropy regularization code.  This value
+                           is also used in computing the cross-entropy objective value.
 */
 void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts,
                               const DenominatorGraph &den_graph,
                               const Supervision &supervision,
                               const CuMatrixBase<BaseFloat> &nnet_output,
-                              BaseFloat *tot_objf,
-                              BaseFloat *tot_weight,
-                              CuMatrixBase<BaseFloat> *nnet_output_deriv);
+                              BaseFloat *objf,
+                              BaseFloat *l2_term,
+                              BaseFloat *weight,
+                              CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                              CuMatrixBase<BaseFloat> *xent_output_deriv = NULL);
+                              
 
 
 }  // namespace chain
diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc
index 3bdf710c489..3f092879b6e 100644
--- a/src/chainbin/nnet3-chain-acc-lda-stats.cc
+++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc
@@ -40,9 +40,11 @@ class NnetChainLdaStatsAccumulator {
 
   void AccStats(const NnetChainExample &eg) {
     ComputationRequest request;
-    bool need_backprop = false, store_stats = false;
+    bool need_backprop = false, store_stats = false,
+        need_xent = false, need_xent_deriv = false;
 
-    GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, &request);
+    GetChainComputationRequest(nnet_, eg, need_backprop, store_stats,
+                               need_xent, need_xent_deriv, &request);
 
     const NnetComputation &computation = *(compiler_.Compile(request));
 
diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc
index 4e32d280638..6820ee125e0 100644
--- a/src/chainbin/nnet3-chain-get-egs.cc
+++ b/src/chainbin/nnet3-chain-get-egs.cc
@@ -25,6 +25,7 @@
 #include "hmm/posterior.h"
 #include "nnet3/nnet-example.h"
 #include "nnet3/nnet-chain-example.h"
+#include "nnet3/nnet-example-utils.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -48,6 +49,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
                         int32 frames_per_eg,
                         int32 frames_overlap_per_eg,
                         int32 frame_subsampling_factor,
+                        int32 cut_zero_frames,
                         int64 *num_frames_written,
                         int64 *num_egs_written,
                         NnetChainExampleWriter *example_writer) {
@@ -78,7 +80,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
   // Instead we select ranges of frames that fully fit within the file;  these
   // might slightly overlap with each other or have gaps.
   std::vector<int32> range_starts_subsampled;
-  chain::SplitIntoRanges(num_feature_frames_subsampled -
+  SplitIntoRanges(num_feature_frames_subsampled -
                          frames_overlap_subsampled,
                          frames_shift_subsampled,
                          &range_starts_subsampled);
@@ -88,10 +90,16 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
   // to the edge are not as accurate as they could be, because when we split we
   // don't know the correct alphas and betas).
   std::vector<Vector<BaseFloat> > deriv_weights;
-  chain::GetWeightsForRanges(frames_per_eg_subsampled,
-                             range_starts_subsampled,
-                             &deriv_weights);
-
+  if (cut_zero_frames >= 0)
+    GetWeightsForRangesNew(frames_per_eg_subsampled,
+                                  cut_zero_frames / frame_subsampling_factor,
+                                  range_starts_subsampled,
+                                  &deriv_weights);
+  else
+    GetWeightsForRanges(frames_per_eg_subsampled,
+                               range_starts_subsampled,
+                               &deriv_weights);
+  
   if (range_starts_subsampled.empty()) {
     KALDI_WARN << "No output for utterance " << utt_id
                << " (num-frames=" << num_feature_frames
@@ -177,35 +185,6 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
   return true;
 }
 
-void RoundUpNumFrames(int32 frame_subsampling_factor,
-                      int32 *num_frames,
-                      int32 *num_frames_overlap) {
-  if (*num_frames % frame_subsampling_factor != 0) {
-    int32 new_num_frames = frame_subsampling_factor *
-        (*num_frames / frame_subsampling_factor + 1);
-    KALDI_LOG << "Rounding up --num-frames=" << (*num_frames)
-              << " to a multiple of --frame-subsampling-factor="
-              << frame_subsampling_factor
-              << ", now --num-frames=" << new_num_frames;
-    *num_frames = new_num_frames;
-  }
-  if (*num_frames_overlap % frame_subsampling_factor != 0) {
-    int32 new_num_frames_overlap = frame_subsampling_factor *
-        (*num_frames_overlap / frame_subsampling_factor + 1);
-    KALDI_LOG << "Rounding up --num-frames-overlap=" << (*num_frames_overlap)
-              << " to a multiple of --frame-subsampling-factor="
-              << frame_subsampling_factor
-              << ", now --num-frames-overlap=" << new_num_frames_overlap;
-    *num_frames_overlap = new_num_frames_overlap;
-  }
-  if (*num_frames_overlap < 0 || *num_frames_overlap >= *num_frames) {
-    KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap) << " < "
-              << "--num-frames=" << (*num_frames);
-  }
-
-}
-
-
 } // namespace nnet2
 } // namespace kaldi
 
@@ -237,6 +216,7 @@ int main(int argc, char *argv[]) {
     bool compress = true;
     int32 left_context = 0, right_context = 0, num_frames = 1,
         num_frames_overlap = 0, length_tolerance = 100,
+        cut_zero_frames = -1,
         frame_subsampling_factor = 1;
 
     std::string ivector_rspecifier;
@@ -244,6 +224,10 @@ int main(int argc, char *argv[]) {
     ParseOptions po(usage);
     po.Register("compress", &compress, "If true, write egs in "
                 "compressed format (recommended)");
+    po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames "
+                "(measured before subsampling) to zero the derivative on each "
+                "side of a cut point (if set, activates new-style derivative "
+                "weights)");
     po.Register("left-context", &left_context, "Number of frames of left "
                 "context the neural net requires.");
     po.Register("right-context", &right_context, "Number of frames of right "
@@ -338,9 +322,10 @@ int main(int argc, char *argv[]) {
           continue;
         }
         if (ProcessFile(normalization_fst, feats, ivector_feats, supervision,
-                        key, compress, left_context, right_context, num_frames,
+                        key, compress,
+                        left_context, right_context, num_frames,
                         num_frames_overlap, frame_subsampling_factor,
-                        &num_frames_written, &num_egs_written,
+                        cut_zero_frames, &num_frames_written, &num_egs_written,
                         &example_writer))
           num_done++;
         else
diff --git a/src/configure b/src/configure
index c90e9ba4ee0..acd63da0d84 100755
--- a/src/configure
+++ b/src/configure
@@ -177,7 +177,10 @@ do
   esac
 done
 
-
+# the idea here is that if you change the configuration options from using
+# CUDA to not using it, or vice versa, we want to recompile all parts of the
+# code that may use a GPU.  Touching this file is a way to force this.
+touch cudamatrix/cu-common.h 2>/dev/null
 
 function failure {
   echo "***configure failed: $* ***" >&2
diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc
index 8718c49eea5..2b23bf0b621 100644
--- a/src/cudamatrix/cu-common.cc
+++ b/src/cudamatrix/cu-common.cc
@@ -51,19 +51,20 @@ void GetBlockSizesForSimpleMatrixOperation(int32 num_rows,
                                            dim3 *dimBlock) {
   KALDI_ASSERT(num_rows > 0 && num_cols > 0);
   int32 col_blocksize = 64, row_blocksize = 4;
-  while (num_cols + (num_cols / 2) <= col_blocksize &&
-         num_rows > 65536 * row_blocksize) {
+  while (col_blocksize > 1 &&
+         (num_cols + (num_cols / 2) <= col_blocksize ||
+          num_rows > 65536 * row_blocksize)) {
     col_blocksize /= 2;
     row_blocksize *= 2;
   }
 
-  KALDI_ASSERT(col_blocksize > 0 && "Matrix too large to process");
-
   dimBlock->x = col_blocksize;
   dimBlock->y = row_blocksize;
   dimBlock->z = 1;
   dimGrid->x = n_blocks(num_cols, col_blocksize);
   dimGrid->y = n_blocks(num_rows, row_blocksize);
+  KALDI_ASSERT(dimGrid->y <= 65536 &&
+               "Matrix has too many rows to process");
   dimGrid->z = 1;
 }
 #endif
diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 804bea1a217..2d8aae1808c 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -59,6 +59,7 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
 void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d);
 void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign,  MatrixDim d);
 void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);  
+void cudaF_apply_signum(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);  
 void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
 void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
@@ -198,6 +199,7 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
 void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d);
 void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim d);
 void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);  
+void cudaD_apply_signum(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);  
 void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
 void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 00af3eb234a..422bc5af2f3 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -931,15 +931,15 @@ static void _add_diag_mat_mat(
   int v_idx = i / threads_per_element,   // v_idx is the index into v that we are supposed to
       sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells
                                          // us which block of elements we sum up.
-  if (v_idx >= v_dim) return;
-
-  Real sum = 0.0;
-  for (int j = sub_idx; j < M_cols; j += threads_per_element) {
-    int M_index = v_idx * M_row_stride + j * M_col_stride,
-        N_index = j * N_row_stride + v_idx * N_col_stride;
-    sum += M[M_index] * N[N_index];
+  if (v_idx < v_dim) {
+    Real sum = 0.0;
+    for (int j = sub_idx; j < M_cols; j += threads_per_element) {
+      int M_index = v_idx * M_row_stride + j * M_col_stride,
+          N_index = j * N_row_stride + v_idx * N_col_stride;
+      sum += M[M_index] * N[N_index];
+    }
+    temp_data[threadIdx.x] = sum;
   }
-  temp_data[threadIdx.x] = sum;
 
   // start_idx = threadIdx.x - sub_idx; // start of the position in temp_data
                                      // that we want to sum up.
@@ -959,7 +959,7 @@ static void _add_diag_mat_mat(
     __syncthreads();
     num_total_threads = half_point;
   }
-  if (sub_idx == 0) {
+  if (sub_idx == 0 && v_idx < v_dim) {
     v[v_idx] = beta * v[v_idx] + alpha * temp_data[threadIdx.x];
   }
 }
@@ -1152,7 +1152,6 @@ __global__
 static void _pvec_sum(Real* v, Real* g, int dim, int size) {
   int i = blockIdx.x * blockDim.x + threadIdx.x;
   int start = size * i;
-  if (start >= dim) return;
   int end = start + size;
   if (end > dim) end = dim;
   __shared__ Real row_data[CU1DBLOCK];
@@ -1261,6 +1260,23 @@ static void _apply_heaviside(Real* mat, MatrixDim d) {
 }
 
 
+// In-place elementwise signum: positive entries become 1, negative entries
+// become -1, and anything else (e.g. zero) is left untouched.  Caution: here
+// i / block{idx,dim}.x is the row index and j / block{idx,dim}.y the column.
+template<typename Real>
+__global__
+static void _apply_signum(Real* mat, MatrixDim d) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int index = i * d.stride + j;
+
+  if (i < d.rows && j < d.cols) {
+    if (mat[index] > 0.0) mat[index] = 1.0;
+    else if (mat[index] < 0.0) mat[index] = -1.0;
+  }
+}
+
+
 template<typename Real>
 __global__
 static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) {
@@ -2145,7 +2161,10 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include
 
 void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
   _apply_heaviside<<<Gr,Bl>>>(mat, d);
+}
 
+void cudaF_apply_signum(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
+  _apply_signum<<<Gr,Bl>>>(mat, d);
 }
 
 void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
@@ -2610,6 +2629,10 @@ void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
   _apply_heaviside<<<Gr,Bl>>>(mat, d);
 }
 
+void cudaD_apply_signum(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
+  _apply_signum<<<Gr,Bl>>>(mat, d);
+}
+
 void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
   _copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
 }
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index fc1fbae54da..57133092574 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -125,6 +125,7 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_ap
 inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); }
 inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim dim) { cudaF_apply_pow_abs(Gr,Bl,mat,power,include_sign, dim); }
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_heaviside(Gr,Bl,mat,dim); }
+inline void cuda_apply_signum(dim3 Gr, dim3 Bl, float* mat, MatrixDim dim) { cudaF_apply_signum(Gr,Bl,mat,dim); }
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim dim) { cudaF_apply_floor(Gr,Bl,mat,floor_val,dim); }
 inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim dim) { cudaF_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
 inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
@@ -311,6 +312,7 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_a
 inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); }
 inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim dim) { cudaD_apply_pow_abs(Gr,Bl,mat,power,include_sign,dim); }
 inline void cuda_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_heaviside(Gr,Bl,mat,dim); }
+inline void cuda_apply_signum(dim3 Gr, dim3 Bl, double* mat, MatrixDim dim) { cudaD_apply_signum(Gr,Bl,mat,dim); }
 inline void cuda_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim dim) { cudaD_apply_floor(Gr,Bl,mat,floor_val,dim); }
 inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim dim) { cudaD_apply_ceiling(Gr,Bl,mat,ceiling_val,dim); }
 inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc
index f50ded8c209..1c32de34d5c 100644
--- a/src/cudamatrix/cu-matrix-speed-test.cc
+++ b/src/cudamatrix/cu-matrix-speed-test.cc
@@ -298,6 +298,23 @@ template<typename Real> void TestCuMatrixSigmoid(int32 dim) {
             << dim << ", speed was " << gflops << " gigaflops.";
 }
 
+template<typename Real> void TestCuMatrixHeaviside(int32 dim) {
+  BaseFloat time_in_secs = 0.025;
+  // ApplyHeaviside() works in place, so a single matrix is all we need.
+  CuMatrix<Real> N(dim, dim);
+  N.SetRandn();
+  Timer tim;
+  int32 iter = 0;
+  for (;tim.Elapsed() < time_in_secs; iter++) {
+    N.ApplyHeaviside();
+  }
+
+  BaseFloat fdim = dim;
+  BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
+  KALDI_LOG << "For CuMatrix::Heaviside" << NameOf<Real>() << ", for dim = "
+            << dim << ", speed was " << gflops << " gigaflops.";
+}
+
 
 template<typename Real> void TestCuMatrixMulRowsGroupMat(int32 dim) {
   BaseFloat time_in_secs = 0.025;
@@ -806,6 +823,8 @@ template<typename Real> void CudaMatrixSpeedTest() {
     TestCuMatrixCholesky<Real>(sizes[s]);
   for (int32 s = 0; s < ns; s++)
     TestCuMatrixSigmoid<Real>(sizes[s]);
+  for (int32 s = 0; s < ns; s++)
+    TestCuMatrixHeaviside<Real>(sizes[s]);
   for (int32 s = 0; s < ns; s++)
     TestCuFindRowMaxId<Real>(sizes[s]);
   for (int32 s = 0; s < ns; s++)
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index eb5a268d543..a7c034a29ae 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -1895,6 +1895,7 @@ void CuMatrixBase<Real>::CopyRowsFromVec(const CuVectorBase<Real> &v) {
       GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
                                             &dimGrid, &dimBlock);
       cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data());
+      CU_SAFE_CALL(cudaGetLastError());
     } else {
       KALDI_ERR << "Wrong sized arguments";
     }
@@ -2016,6 +2017,24 @@ void CuMatrixBase<Real>::ApplyHeaviside() {
   }
 }
 
+template<typename Real>
+void CuMatrixBase<Real>::ApplySignum() {
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    Timer tim;
+    dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
+    dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK),
+                 n_blocks(NumCols(), CU2DBLOCK));
+    // Launch the signum kernel: in place, x > 0 -> 1 and x < 0 -> -1.
+    cuda_apply_signum(dimGrid, dimBlock, data_, Dim());
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
+  } else
+#endif
+  {
+    Mat().ApplySignum();
+  }
+}
 
 template<typename Real>
 void CuMatrixBase<Real>::ApplyExp() {
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index fd4c642ab7f..6ae233b8f56 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -337,6 +337,7 @@ class CuMatrixBase {
   ///< multiply the result by the sign of the input.
   void ApplyPowAbs(Real power, bool include_sign=false);
   void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0)
+  void ApplySignum(); ///< For each element, sets x = (1 if x > 0; 0 if x = 0; -1 if x < 0)
   void ApplyFloor(Real floor_val);
   void ApplyCeiling(Real ceiling_val);
   void ApplyExp();
diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc
index a32e136f62e..9b7aa97776a 100644
--- a/src/cudamatrix/cu-vector-test.cc
+++ b/src/cudamatrix/cu-vector-test.cc
@@ -22,7 +22,7 @@
 #include <iostream>
 #include <vector>
 #include <cstdlib>
-
+#include <cmath>
 #include "base/kaldi-common.h"
 #include "util/common-utils.h"
 #include "cudamatrix/cu-matrix.h"
@@ -62,7 +62,7 @@ static void UnitTestCuVectorIO() {
 }
 
 
-template<typename Real, typename OtherReal> 
+template<typename Real, typename OtherReal>
 static void UnitTestCuVectorCopyFromVec() {
   for (int32 i = 1; i < 10; i++) {
     MatrixIndexT dim = 10 * i;
@@ -80,7 +80,7 @@ static void UnitTestCuVectorCopyFromVec() {
   }
 }
 
-template<typename Real> 
+template<typename Real>
 static void UnitTestCuSubVector() {
   for (int32 iter = 0 ; iter < 10; iter++) {
     int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3,
@@ -97,7 +97,7 @@ static void UnitTestCuSubVector() {
 
 
 
-template<typename Real> 
+template<typename Real>
 static void UnitTestCuVectorMulTp() {
   for (int32 i = 1; i < 10; i++) {
     MatrixIndexT dim = 10 * i;
@@ -105,7 +105,7 @@ static void UnitTestCuVectorMulTp() {
     A.SetRandn();
     TpMatrix<Real> B(dim);
     B.SetRandn();
-    
+
     CuVector<Real> C(A);
     CuTpMatrix<Real> D(B);
 
@@ -127,10 +127,10 @@ static void UnitTestCuVectorAddTp() {
     B.SetRandn();
     Vector<Real> C(dim);
     C.SetRandn();
-    
+
     CuVector<Real> D(A);
     CuTpMatrix<Real> E(B);
-    CuVector<Real> F(C); 
+    CuVector<Real> F(C);
 
     A.AddTpVec(1.0, B, kNoTrans, C, 1.0);
     D.AddTpVec(1.0, E, kNoTrans, F, 1.0);
@@ -160,7 +160,7 @@ template<typename Real> void CuVectorUnitTestAddVec() {
   CuVector<Real> vec1_orig(vec1);
   BaseFloat alpha = 0.43243;
   vec1.AddVec(alpha, vec2);
-  
+
   for (int32 i = 0; i < M; i++)
     AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
 }
@@ -177,7 +177,7 @@ template<typename Real> void CuVectorUnitTestAddVecCross() {
       CuVector<Real> vec1_orig(vec1);
       Real alpha = 0.43243;
       vec1.AddVec(alpha, vec2);
-  
+
       for (int32 i = 0; i < M; i++)
         AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i));
     } else {
@@ -198,7 +198,7 @@ template<typename Real> void CuVectorUnitTestAddVecExtra() {
   CuVector<Real> vec1_orig(vec1);
   BaseFloat alpha = 0.43243, beta = 1.4321;
   vec1.AddVec(alpha, vec2, beta);
-  
+
   for (int32 i = 0; i < M; i++)
     AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i));
 }
@@ -268,6 +268,20 @@ template<typename Real> static void UnitTestCuVectorReplaceValue() {
   }
 }
 
+template<typename Real> static void UnitTestCuVectorSum() {
+  for (int32 i = 0; i < 200; i++) {  // many random sizes and offsets
+    int32 start_dim = RandInt(1, 500), end_dim = RandInt(1, 500);
+    int32 dim = RandInt(10, 12000) + start_dim + end_dim;
+    Real quiet_nan = nan("");  // this is from <cmath>.
+    CuVector<Real> vec(start_dim + dim + end_dim);  // layout: NaN | ones | NaN
+    vec.Range(0, start_dim).Set(quiet_nan);
+    vec.Range(start_dim, dim).Set(1.0);
+    vec.Range(start_dim + dim, end_dim).Set(quiet_nan);
+    // Sum only the ones; reading outside the sub-range would yield NaN.
+    Real sum = vec.Range(start_dim, dim).Sum();
+    KALDI_ASSERT(ApproxEqual(sum, dim));
+  }
+
 template<typename Real> void CuVectorUnitTestInvertElements() {
   // Also tests MulElements();
   int32 M = 256 + Rand() % 100;
@@ -288,7 +302,7 @@ template<typename Real> void CuVectorUnitTestSum() {
     CuVector<Real> A(dim), ones(dim);
     A.SetRandn();
     ones.Set(1.0);
-    
+
     AssertEqual(VecVec(A, ones), A.Sum());
   }
 }
@@ -320,7 +334,7 @@ template<typename Real> void CuVectorUnitTestCopyFromMat() {
   }
   Matrix<Real> matrix(cu_matrix), matrix2(M, N);
   CuMatrix<Real> matrix3(M, N);
-  
+
   CuVector<Real> vector(M * N), vector2(M * N);
   vector.CopyRowsFromMat(cu_matrix);
   vector2.CopyRowsFromMat(matrix);
@@ -328,8 +342,8 @@ template<typename Real> void CuVectorUnitTestCopyFromMat() {
   matrix3.CopyRowsFromVec(Vector<Real>(vector2));
   Vector<Real> vector3(M * N);
   vector3.CopyRowsFromMat(cu_matrix);
-                                         
-  
+
+
   for(int32 j = 0; j < M*N; j++) {
     if (Rand() % 500 == 0) { // random small subset (it was slow)
       KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N));
@@ -412,7 +426,7 @@ template<typename Real> void CuVectorUnitTestNorm() {
   KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0));
   KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0)));
 }
-               
+
 
 template<typename Real> void CuVectorUnitTestMin() {
   for (int32 p = 0; p < 5; p++) {
@@ -496,7 +510,7 @@ template<typename Real> void CuVectorUnitTestApplyFloor() {
     BaseFloat floor = 0.33 * (-5 + Rand() % 10);
     int32 i = cu_vector.ApplyFloor(floor);
     int32 j = vector.ApplyFloor(floor);
-  
+
     CuVector<Real> cu2(vector);
 
     AssertEqual(cu2, cu_vector);
@@ -517,7 +531,7 @@ template<typename Real> void CuVectorUnitTestApplyCeiling() {
     BaseFloat floor = 0.33 * (-5 + Rand() % 10);
     int32 i = cu_vector.ApplyCeiling(floor);
     int32 j = vector.ApplyCeiling(floor);
-  
+
     CuVector<Real> cu2(vector);
 
     AssertEqual(cu2, cu_vector);
@@ -540,7 +554,7 @@ template<typename Real> void CuVectorUnitTestApplyPow() {
     BaseFloat pow = -2 + (Rand() % 5);
     cu_vector.ApplyPow(pow);
     vector.ApplyPow(pow);
-  
+
     CuVector<Real> cu2(vector);
 
     AssertEqual(cu2, cu_vector);
@@ -579,7 +593,7 @@ template<typename Real> void CuVectorUnitTestAddDiagMat2() {
     cu_mat_orig.SetRandn();
     MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans);
     CuMatrix<Real> cu_mat(cu_mat_orig, trans);
-    
+
     Vector<Real> vector(cu_vector);
     Matrix<Real> mat(cu_mat);
 
@@ -604,12 +618,12 @@ static void CuVectorUnitTestAddDiagMatMat() {
     MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans);
     MatrixTransposeType transN = ((iter/2) % 2 == 0 ? kNoTrans : kTrans);
     CuMatrix<Real> M(M_orig, transM), N(N_orig, transN);
-    
+
     v.SetRandn();
     CuVector<Real> w(v);
 
     w.AddDiagMatMat(alpha, M, transM, N, transN, beta);
-    
+
     {
       CuVector<Real> w2(v);
       CuMatrix<Real> MN(dimM, dimM);
@@ -669,7 +683,7 @@ template<typename Real> void CuVectorUnitTestAddSpVec() {
     CuSpMatrix<Real> mat_cu(M);
     mat_cu.SetRandn();
     SpMatrix<Real> mat(mat_cu);
-    
+
     BaseFloat alpha = 0.5 * (Rand() % 5), beta = 0.5 * (Rand() % 5);
     dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta);
     dst.AddSpVec(alpha, mat, src, beta);
@@ -695,6 +709,7 @@ template<typename Real> void CuVectorUnitTest() {
   CuVectorUnitTestScale<Real>();
   CuVectorUnitTestSum<Real>();
   CuVectorUnitTestInvertElements<Real>();
+  UnitTestCuVectorSum<Real>();
   CuVectorUnitTestAddRowSumMat<Real>();
   CuVectorUnitTestAddColSumMat<Real>();
   UnitTestCuVectorReplaceValue<Real>();
@@ -708,8 +723,8 @@ template<typename Real> void CuVectorUnitTest() {
   CuVectorUnitTestCopyDiagFromPacked<Real>();
   CuVectorUnitTestCopyDiagFromMat<Real>();
   CuVectorUnitTestCopyCross<Real>();
-  CuVectorUnitTestCopyCross2<Real>();  
-  CuVectorUnitTestNorm<Real>();  
+  CuVectorUnitTestCopyCross2<Real>();
+  CuVectorUnitTestNorm<Real>();
   CuVectorUnitTestApplyExp<Real>();
   CuVectorUnitTestApplyLog<Real>();
   CuVectorUnitTestApplyFloor<Real>();
@@ -732,10 +747,10 @@ int main(int argc, char *argv[]) {
   const char *usage = "Usage: cu-vector-test [options]";
 
   ParseOptions po(usage);
-  std::string use_gpu = "yes";    
+  std::string use_gpu = "yes";
   po.Register("use-gpu", &use_gpu, "yes|no|optional");
   po.Read(argc, argv);
-  
+
   if (po.NumArgs() != 0) {
     po.PrintUsage();
     exit(1);
diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc
index 64f41720869..6deb3809d85 100644
--- a/src/cudamatrix/cu-vector.cc
+++ b/src/cudamatrix/cu-vector.cc
@@ -279,7 +279,6 @@ Real CuVectorBase<Real>::Sum() const {
       CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
       return tmp.Sum();
     } else {
-      if (dim_ == 0) return 0.0;
       CuVector<Real> tmp(1, kUndefined);
       int dimBlock(CU1DBLOCK);
       int dimGrid = 1; // only 1 block here. we have loops in each thread.
diff --git a/src/doc/glossary.dox b/src/doc/glossary.dox
index ba42ea12370..31fa62d3389 100644
--- a/src/doc/glossary.dox
+++ b/src/doc/glossary.dox
@@ -26,7 +26,7 @@
  search function of your browser.  For convenience the definition of each
  term's section is preceded and followed by a colon, so for
  instance, typing ctrl-f ":lattice:" would take you to the section for "lattice".
- 
+
 
 <div style="text-indent: -1.5em;  padding-left: 1.5em;">
 
@@ -37,7 +37,7 @@ synonymous with a sequence of <b>transition-ids</b>.  Most of the time an
 alignment is derived from aligning the reference transcript of an utterance,
 in which case it is called a <b>forced alignment</b>.  <b>lattices</b> also
 contain alignment information as sequences of transition-ids for each word
-sequence in the lattice.  The program \ref bin/show-alignments.cc "show-alignments" shows 
+sequence in the lattice.  The program \ref bin/show-alignments.cc "show-alignments" shows
 alignments in a human-readable format.
 
 <b>:forced alignment:</b> see <b>alignment</b>.
@@ -54,6 +54,18 @@ of the HMMs, and also various other important integer mappings; see \ref transit
 This object is generally written at the start of model files.  The program
 \ref bin/show-transitions.cc "show-transitions" shows these.
 
+<b>:G.fst:</b>  The grammar FST <code>G.fst</code> which lives in the
+  <code>data/lang/</code> directory in the scripts (see \ref data_prep_lang) represents
+  the language model in a Finite State Transducer format (see www.openfst.org).
+ For the most part it is an acceptor, meaning the input and output symbols on the
+ arcs are the same, but for statistical language models with backoff, the backoff
+ arcs have the "disambiguation symbol" <code>#0</code> on the input side only.
+ For many purposes you'll want to get rid of the disambiguation symbols
+  using the command <code>fstproject --project_output=true</code>.  The disambiguation symbols
+ are needed during graph compilation to make the FST determinizable, but for things
+ like language-model rescoring you don't want them.
+
+
 </div>
 
 */
diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox
index 9935fa52711..938321fd7b2 100644
--- a/src/doc/hmm.dox
+++ b/src/doc/hmm.dox
@@ -447,9 +447,10 @@ We now explain what these three scales do:
    when we add the self-loop, let the probability mass given to the self-loop be p
    and the mass given to the rest be (1-p).  We add a self-loop with log-probability
    self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other 
-   log transition probabilities
-   out of that state.  In typical topologies, the self-loop scale is the only scale
-   that matters.
+   log transition probabilities out of that state.  (Note: in the initial stage of
+   graph creation we create a graph without self-loops, and with the non-self-loop
+   transition probabilities renormalized to sum to one).  In typical topologies, the 
+   self-loop scale is the only scale that matters.
 
 The reason we feel it might make sense to apply a different probability scale to
 the self-loops versus the normal transition scale is we think they could be
diff --git a/src/doc/install.dox b/src/doc/install.dox
index 0ffb2b1220f..b40b139a8dc 100644
--- a/src/doc/install.dox
+++ b/src/doc/install.dox
@@ -29,8 +29,8 @@
   possibly including unfinished and experimental features, can
    be downloaded by typing into a shell:
   \verbatim
-    git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden
-    cd kaldi-trunk
+    git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream
+    cd kaldi
   \endverbatim
  If you want to get updates and bug fixes you can go to some checked-out
  directory, and type
diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox
index ee2bc11d8b9..df9f96e8430 100644
--- a/src/doc/tree_externals.dox
+++ b/src/doc/tree_externals.dox
@@ -32,13 +32,13 @@ namespace kaldi {
 
   The basic algorithm that is being implemented is a top-down greedy splitting, where we have a number of
   ways we can split the data by asking about, say, the left phone, the right
-  phone, the central phone, the state we're in, and so on.  
+  phone, the central phone, the state we're in, and so on.
  The algorithm we implement is similar to the standard algorithm,
  see for example the paper "Tree-based State Tying for High Accuracy Acoustic Modeling" by
  Young, Odell and Woodland. In this algorithm, we split the data up by asking the locally
   optimal question, i.e. the one that gives the most likelihood increase, supposing
-  we model the data on each side of the split by a single Gaussian. 
- Differences from standard implementations include added flexibility 
+  we model the data on each side of the split by a single Gaussian.
+ Differences from standard implementations include added flexibility
  about how to configure the tree roots; the ability to ask questions about the HMM-state and
  the central phone; and the fact that by default in the Kaldi scripts, the questions
  are automatically generated by a top-down binary clustering of the data, which means
@@ -50,7 +50,7 @@ namespace kaldi {
  be the tree roots.  For how to configure it using the standard scripts, see
  \ref data_prep.   In practice we generally let each tree-root correspond to a "real phone", meaning
  that we group together all word-position-dependent, tone-dependent or stress-dependent versions of
- each phone into one group that becomes a tree root.  
+ each phone into one group that becomes a tree root.
 
   The rest of this page mostly gives details at the code level of what is happening.
 
@@ -74,7 +74,7 @@ below summarizes these values:
 </table>
 
 N is the width of the context window and P is the identity of the designated
-"central phone".  Normally P is exactly in the middle of the window 
+"central phone".  Normally P is exactly in the middle of the window
 (hence the name "central-position"); for example, with N=3, we would normally
 have P=1, but you are free to choose any value from 0 to N-1; for instance, P=2 and
 N=3 means two phones of left context and no right context at all.
@@ -82,32 +82,32 @@ In the code, when we talk about the "central phone" we always mean the P'th
 phone which may or may not actually be the central phone of the context window.
 
 A vector of integers representing a typical triphone context window might be:
-\code 
-// probably not valid C++ 
+\code
+// probably not valid C++
 vector<int32> ctx_window = { 12, 15, 21 };
 \endcode
-Assuming N=3 and P=1, this would represent phone 15 with 
+Assuming N=3 and P=1, this would represent phone 15 with
 a right context of 21 and a left context of 12.  The way we handle end
 effects is using zero (which is not a valid phone because it's reserved in
 OpenFst for the epsilon meaning "no symbol"), so for instance:
-\code 
+\code
 vector<int32> ctx_window = { 12, 15, 0 };
 \endcode
 means phone 15 with a left-context of 12 and no right-context because it's the
 end of the utterance.  At the end of utterance in particular, the use of zero
 this way may be a little unexpected because the last "phone" is actually the
-subsequential symbol "$" (see \ref graph_c), but for the convenience 
+subsequential symbol "$" (see \ref graph_c), but for the convenience
 of the decision-tree code we don't
 put the subsequential symbol in these context windows, we put zero.  Note
 that if we had N=3 and P=2, the above context window would be invalid because
 its P'th element would be zero which is not a real phone; also of course,
-if we had a tree with N=1, neither of the windows above would be valid because they 
+if we had a tree with N=1, neither of the windows above would be valid because they
 are the wrong size.  In the monophone case, we would have a window like:
-\code 
+\code
 vector<int32> ctx_window = { 15 };
 \endcode
 so monophone systems are just treated as a special case of context-dependent
-systems, with a window size N of 1 and a tree that doesn't do anything very 
+systems, with a window size N of 1 and a tree that doesn't do anything very
 interesting.
 
 
@@ -126,28 +126,28 @@ TransitionModel object and an AmDiagGmm object).  If the program gmm-init-mono
 receives an option called --shared-phones, it will share the pdfs between
 specified sets of phones; otherwise it makes all the phones separate.
 
-After training a monophone system starting from a flat start, we take 
+After training a monophone system starting from a flat start, we take
 the monophone alignments
-and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc 
+and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc
 "acc-tree-stats") to accumulate statistics for training the tree.  This program is
 not limited to reading in monophone alignments; it works from context-dependent
 alignments too so we can build trees based on e.g. triphone alignments.
-The statistics for tree building are written to disk as the type \ref BuildTreeStatsType 
-(see \ref treei_stats).  
+The statistics for tree building are written to disk as the type \ref BuildTreeStatsType
+(see \ref treei_stats).
 The function AccumulateTreeStats() takes the values N and P, as explained in the
 previous section; the command-line programs will set these by default to 3 and
 1 respectively, but this can be overridden using the --context-width
-and --central-position options.  The program \ref acc-tree-stats.cc 
+and --central-position options.  The program \ref acc-tree-stats.cc
 "acc-tree-stats" takes a list of context-independent phones (e.g. silence), but this is
 not required even if there are context-independent phones; it is just
-a mechanism to reduce the size of the statistics.  
+a mechanism to reduce the size of the statistics.
 For context-independent hones, the program will accumulate the
 corresponding statistics without the keys corresponding to the left and right phones defined
 (c.f. \ref treei_event_map).
 
 When the statistics have been
-accumulated we use the program \ref build-tree.cc "build-tree" to 
-build the tree.  This outputs the tree.  
+accumulated we use the program \ref build-tree.cc "build-tree" to
+build the tree.  This outputs the tree.
 The program \ref build-tree.cc "build-tree" requires three things:
   - The statistics (of type BuildTreeStatsType)
   - The questions config (of type Questions)
@@ -160,21 +160,32 @@ scripts, these are automatically obtained from tree-building statistics
 by the program cluster-phones.  The roots file specifies sets of phones
 that are goint to have shared roots in the decision-tree clustering process, and says
 for each phone set the following two things:
-  - "shared" or "not-shared" says whether or not there should be separate  
-    roots for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states,
-    in the typical case), or if the roots
-    should be shared.  If we are going to be splitting (the "split" option
-    below) we enforce that the roots should be shared.
+
+  - "shared" or "not-shared" says whether or not there should be separate roots
+    for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, in the
+    typical case), or if the roots should be shared.  If it says "shared" there
+    will be a single tree-root for all HMM states (e.g. all three states, in a
+    normal topology); if "not-shared" there would be (e.g.) three tree-roots,
+    one for each pdf-class.
+
   - "split" or "not-split" says whether or not the decision tree splitting
     should actually be done for the roots in question (for silence, we
-    typically don't split).
+    typically don't split).  If the line says "split" (the normal case) then
+    we do the decision tree splitting.  If it says "not-split" then no splitting
+    is done and the roots are left un-split.
 
-Be careful because the notation is a bit tricky.  The "shared" on the line of
-the roots file is about whether we will share all the 3 HMM-states of the phone
-in a single tree root.  But we will always share together the roots of all the phones that
-appear on a single lines of the roots file.  This is not configurable via these
-strings because if you don't want to share them, you can just put them on
-separate lines of the roots file. 
+
+The following will clarify some aspects of how this works:
+
+ - If we say "shared split", then
+   even though there is one root node for all three HMM-states, the different
+   HMM states can still get different leaves because the tree can ask questions
+   about the pdf-class as well as about phonetic context.
+
+ - We always share together the roots of all the phones that appear on a single
+   line of the roots file.  This is not configurable via these strings because
+   if you don't want to share the phones' roots, you can just put them on
+   separate lines of the roots file.
 
 Below is an example of a roots file; this assumes that phone 1 is silence
 and all the other phones have separate roots.
@@ -185,14 +196,14 @@ shared split 3
 ...
 shared split 28
 \endverbatim
-Having multiple phones on the same line is most useful when we have things like position and 
+Having multiple phones on the same line is most useful when we have things like position and
 stress-dependent phones; in this case each "real" phone would correspond
 to a set of integer phone ids.  In that case we share the roots for all
 versions of a particular underlying phone.  Below is an example of a roots file
-for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; 
+for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form;
 it would have to be converted to integer form before being read by Kalid):
 \verbatim
-not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S 
+not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S
 shared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_S
 shared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_S
 shared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_S
@@ -207,7 +218,7 @@ When creating the roots file, you should ensure that at least one phone on each
 For instance, in this case, if the phone AY was seen in at least some combination of stress and
 word-position, we would be OK.
 
-In this example, we have various word-position-dependent variants of silence and so on.  
+In this example, we have various word-position-dependent variants of silence and so on.
 In this example they will all share their pdf's because they are on the same line and are
 "not-split"-- but they may have different transition parameters.  In fact, most of these
 variants of silence would never be used as silence never appears inside words; this is for
@@ -224,13 +235,13 @@ tree to another using the program \ref convert-ali.cc "convert-ali".
  pdf-id, and these are contiguous (typically there are several thousand of these in an LVCSR
  system).  They are originally assigned when the tree is first built.  Depending
  how the tree is built, it may or may not be possible to say, for each pdf-id, which phone
- it corresponds to.  
+ it corresponds to.
 
 \section tree_ctxdep Context dependency objects
 
  The ContextDependencyInterface object is a virtual base-class for the
  tree that specifies how it interacts with the graph-building code.  This
- interface contains only four functions: 
+ interface contains only four functions:
     - \ref ContextDependencyInterface::ContextWidth() "ContextWidth()" returns
         the value of N (context-width) that the tree requires.
     - \ref ContextDependencyInterface::CentralPosition() "CentralPosition()" returns
@@ -264,8 +275,8 @@ else
 \endcode
 
 The only class that currently inherits from ContextDependencyInterface
-is the class ContextDependency, which has marginally richer interface; 
-the only important addition is the function \ref ContextDependency::GetPdfInfo 
+is the class ContextDependency, which has a marginally richer interface;
+the only important addition is the function \ref ContextDependency::GetPdfInfo
 "GetPdfInfo" which is used by the TransitionModel class to work out which
 phones a particular pdf can possibly correspond to (this function could
 be emulated given only the interface of ContextDependencyInterface, by
@@ -274,7 +285,7 @@ enumerating all contexts).
 The ContextDependency object is actually a fairly thin wrapper for the
 EventMap object; see \ref tree_internals.  We wanted to hide
 the actual implementation of the tree as much as possible to make it
-easy to refactor the code later if needed. 
+easy to refactor the code later if needed.
 
 \section tree_example An example of a decision tree
 
@@ -309,18 +320,18 @@ Below is a kind of quasi-BNF notation that explains the tree-file format.
 In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that
 splits on key 1, which is the central phone.  In square brackets are a contiguous range
 of phone-ids.  As it happens, these don't represent a question, but are just a way of
-splitting on phones so we can get to the "real" decision trees which are per phone.  
+splitting on phones so we can get to the "real" decision trees which are per phone.
 The issue is that this tree was built with "shared roots", so there are various phone-ids,
 corresponding to different word-position-and-stress-marked versions of the same phone,
 that share the root.  We can't use a TableEventMap (TE) at the top level of the tree,
 or we'd have to repeat each decision tree several times (since the EventMap is a pure
-tree, not a general graph, it has no mechanism for pointers to be "shared").  
-The next few instances of the "SE" label are also part of this "quasi-tree" which 
+tree, not a general graph, it has no mechanism for pointers to be "shared").
+The next few instances of the "SE" label are also part of this "quasi-tree" which
 is initially splitting on the central phone (as we go down this file we are going
 deeper into the tree; notice that the braces "{" are opening but not yet closing).
 Then we have the string
 "TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap
-on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4.  
+on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4.
 The values represent the five pdf-ids
 for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these
 three non-speech phones (only the transition matrix is specific to each non-speech phone).
@@ -332,8 +343,8 @@ various versions of the phone AA; and question is asking whether the pdf-class (
 has value 0 (i.e. the leftmost HMM-state).  Assuming the answer is "yes", the next question
 is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various
 forms of the phone "M" (a rather unintuitive question to ask, since we're
-in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is 
-a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if 
+in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is
+a question about the phone to the left; if yes, then the pdf-id is 5 ("CE 5") and if
 no, 696 ("CE 696").
 \verbatim
 s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100
@@ -366,8 +377,8 @@ SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 4
 \endverbatim
 
 Below is a simpler example: the monophone tree from the Resource Management
-recipe.  The top-level EventMap is a TableEventMap ("TE 0 49 ...").  
-The key "0" is the phone-position of zero which represents the central (and only) phone 
+recipe.  The top-level EventMap is a TableEventMap ("TE 0 49 ...").
+The key "0" is the phone-position of zero which represents the central (and only) phone
 since the context width (N) is 1.  The number of entries in the table is 49
 (in this case, the number of phones plus one).  The
 first EventMap in the table (index zero) is NULL, because there is no phone with
@@ -375,11 +386,11 @@ index zero.  The next one is a TableEventMap with three elements, corresponding
 to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )".
 \verbatim
 s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5
-ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) 
-TE -1 3 ( CE 3 CE 4 CE 5 ) 
-TE -1 3 ( CE 6 CE 7 CE 8 ) 
-TE -1 3 ( CE 9 CE 10 CE 11 ) 
-TE -1 3 ( CE 12 CE 13 CE 14 ) 
+ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 )
+TE -1 3 ( CE 3 CE 4 CE 5 )
+TE -1 3 ( CE 6 CE 7 CE 8 )
+TE -1 3 ( CE 9 CE 10 CE 11 )
+TE -1 3 ( CE 12 CE 13 CE 14 )
 \endverbatim
 
 
@@ -391,8 +402,8 @@ disambiguation symbols and possibly epsilon symbols).  In the graph, as always,
 these are represented by integer labels.  We use an object that, in code
 and in filenames, is generally called ilabel_info.  The ilabel_info object
 4has a strong connection to the \ref fst::ContextFst "ContextFst" objects, see \ref graph_context.
-As with many other Kaldi types, ilabel_info is a generic (STL) type but 
-we use a consistent variable name 
+As with many other Kaldi types, ilabel_info is a generic (STL) type but
+we use a consistent variable name
 to make it identifiable.  It is of the following type:
 \code
  std::vector<std::vector<int32> > ilabel_info;
@@ -402,7 +413,7 @@ input label the corresponding phonetic context window (see above,
 \ref tree_window).  For example, suppose symbol 1500 is phone
 30 with a right-context of 12 and a left-context of 4, we would
 have
-\code 
+\code
  // not valid C++
  ilabel_info[1500] == { 4, 30, 12 };
 \endcode
@@ -410,14 +421,14 @@ In the monophone case, we would have things like:
 \code
  ilabel_info[30] == { 28 };
 \endcode
-There are special cases to deal with disambiguation symbols (see 
-\ref graph_disambig or the 
+There are special cases to deal with disambiguation symbols (see
+\ref graph_disambig or the
 Springer Handbook paper referenced above for an explanation of what these
 are).  If an ilabel_info entry corresponds to a disambiguation symbol,
 we put in it the negative of the symbol-table entry of the disambiguation
 symbol (note that this is not the same as the number of the printed form
-of the disambiguation symbol as in #0, #1, #2 etc., it is the number 
-corresponding to it in a symbol-table file, which in our current scripts is 
+of the disambiguation symbol as in #0, #1, #2 etc., it is the number
+corresponding to it in a symbol-table file, which in our current scripts is
 called phones_disambig.txt).  For example,
 \code
  ilabel_info[5] == { -42 };
@@ -428,7 +439,7 @@ so the programs that interpret the ilabel_info object don't need to be
 given a list of disambiguation symbols in order to be able to distinguish them from
 real phones in the monophone case.  There are two additional special cases:
 we have
-\code 
+\code
  ilabel_info[0] == { }; // epsilon
  ilabel_info[1] == { 0 }; // disambig symbol #-1;
  // we use symbol 1, but don't consider this hardwired.
diff --git a/src/feat/signal.cc b/src/feat/signal.cc
index 19b876989c2..e8fbb0b84cf 100644
--- a/src/feat/signal.cc
+++ b/src/feat/signal.cc
@@ -34,7 +34,7 @@ void ElementwiseProductOfFft(const Vector<BaseFloat> &a, Vector<BaseFloat> *b) {
 void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
   int32 signal_length = signal->Dim();
   int32 filter_length = filter.Dim();
-  Vector<float> signal_padded(signal_length + filter_length - 1);
+  Vector<BaseFloat> signal_padded(signal_length + filter_length - 1);
   signal_padded.SetZero();
   for (int32 i = 0; i < signal_length; i++) {
     for (int32 j = 0; j < filter_length; j++) {
@@ -54,11 +54,11 @@ void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat>
 
   SplitRadixRealFft<BaseFloat> srfft(fft_length);
 
-  Vector<float> filter_padded(fft_length);
+  Vector<BaseFloat> filter_padded(fft_length);
   filter_padded.Range(0, filter_length).CopyFromVec(filter);
   srfft.Compute(filter_padded.Data(), true);
 
-  Vector<float> signal_padded(fft_length);
+  Vector<BaseFloat> signal_padded(fft_length);
   signal_padded.Range(0, signal_length).CopyFromVec(*signal);
   srfft.Compute(signal_padded.Data(), true);
 
@@ -83,13 +83,13 @@ void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFl
   KALDI_VLOG(1) << "Block size is " << block_length;
   SplitRadixRealFft<BaseFloat> srfft(fft_length);
 
-  Vector<float> filter_padded(fft_length);
+  Vector<BaseFloat> filter_padded(fft_length);
   filter_padded.Range(0, filter_length).CopyFromVec(filter);
   srfft.Compute(filter_padded.Data(), true);
 
-  Vector<float> temp_pad(filter_length - 1);
+  Vector<BaseFloat> temp_pad(filter_length - 1);
   temp_pad.SetZero();
-  Vector<float> signal_block_padded(fft_length);
+  Vector<BaseFloat> signal_block_padded(fft_length);
 
   for (int32 po = 0; po < signal_length; po += block_length) {
     // get a block of the signal
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 9843e7bbd4b..a2ad0032815 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -15,7 +15,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
     process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \
     compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \
     wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \
-    concat-feats
+    concat-feats vector-to-feat extract-column
 
 OBJFILES =
 
diff --git a/src/featbin/extract-column.cc b/src/featbin/extract-column.cc
new file mode 100644
index 00000000000..2bbf6b17235
--- /dev/null
+++ b/src/featbin/extract-column.cc
@@ -0,0 +1,82 @@
+// featbin/extract-column.cc
+
+// Copyright 2015  Vimal Manohar (Johns Hopkins University)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/kaldi-matrix.h"
+
+// Entry point: copies one column of every input matrix into an output vector.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace std;
+
+    const char *usage =
+        "Extract a column out of a matrix. \n"
+        "This is most useful to extract log-energies \n"
+        "from feature files\n"
+        "\n"
+        "Usage: extract-column [options] --column-index=<col-index> <features-rspecifier> <vector-wspecifier>\n"
+        "  e.g. extract-column ark:feats-in.ark ark:energies.ark\n"
+        "See also: select-feats, subset-feats, subsample-feats, extract-rows\n";
+
+    ParseOptions po(usage);
+    int32 column_index = 0;  // zero-based; defaults to the first column
+    po.Register("column-index", &column_index,
+                "Index of column to extract");
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    // A negative index would pass the upper-bound check below; reject it here.
+    if (column_index < 0)
+      KALDI_ERR << "Column index " << column_index << " must not be negative";
+
+    string feat_rspecifier = po.GetArg(1);
+    string vector_wspecifier = po.GetArg(2);
+
+    SequentialBaseFloatMatrixReader reader(feat_rspecifier);
+    BaseFloatVectorWriter writer(vector_wspecifier);
+
+    // Out-of-range indices go through KALDI_ERR, not a count: num_err stays 0.
+    int32 num_done = 0, num_err = 0;
+
+    for (; !reader.Done(); reader.Next(), num_done++) {
+      const Matrix<BaseFloat>& feats(reader.Value());
+      if (column_index >= feats.NumCols()) {
+        KALDI_ERR << "Column index " << column_index << " is "
+                  << "not less than number of columns " << feats.NumCols();
+      }
+      Vector<BaseFloat> col(feats.NumRows());  // sized only after validation
+      col.CopyColFromMat(feats, column_index);
+      writer.Write(reader.Key(), col);
+    }
+
+    KALDI_LOG << "Processed " << num_done << " segments successfully; "
+              << "errors on " << num_err;
+    return (num_done > 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+
diff --git a/src/featbin/vector-to-feat.cc b/src/featbin/vector-to-feat.cc
new file mode 100644
index 00000000000..5e98cf95a1c
--- /dev/null
+++ b/src/featbin/vector-to-feat.cc
@@ -0,0 +1,99 @@
+// featbin/vector-to-feat.cc
+
+// Copyright 2015   Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "matrix/kaldi-matrix.h"
+
+// Entry point: converts each input vector into a one-column feature matrix.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+
+    const char *usage =
+        "Convert a vector into a single feature so that it can be appended \n"
+        "to other feature matrices\n"
+        "Usage: vector-to-feat <vector-rspecifier> <feature-wspecifier>\n"
+        "or:   vector-to-feat <vector-rxfilename> <feature-wxfilename>\n"
+        "e.g.: vector-to-feat scp:weights.scp ark:weight_feats.ark\n"
+        " or: vector-to-feat weight_vec feat_mat\n"
+        "See also: copy-feats, copy-matrix, paste-feats, \n"
+        "subsample-feats, splice-feats\n";
+
+    ParseOptions po(usage);
+    bool compress = false, binary = true;
+    po.Register("binary", &binary, "Binary-mode output (not relevant if writing "
+                "to archive)");
+    po.Register("compress", &compress, "If true, write output in compressed form "
+                "(only currently supported for wspecifier, i.e. archive/script, "
+                "output)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    int32 num_done = 0;
+    if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) {
+      std::string vector_rspecifier = po.GetArg(1);
+      std::string feature_wspecifier = po.GetArg(2);
+
+      SequentialBaseFloatVectorReader vector_reader(vector_rspecifier);
+      // Open only the writer in use; two writers on one wspecifier would clash.
+      BaseFloatMatrixWriter feat_writer;
+      CompressedMatrixWriter compressed_feat_writer;
+      if (!(compress ? compressed_feat_writer.Open(feature_wspecifier)
+                     : feat_writer.Open(feature_wspecifier)))
+        KALDI_ERR << "Error opening output " << feature_wspecifier;
+
+      for (; !vector_reader.Done(); vector_reader.Next(), ++num_done) {
+        const Vector<BaseFloat> &vec = vector_reader.Value();
+        Matrix<BaseFloat> feat(vec.Dim(), 1);
+        feat.CopyColFromVec(vec, 0);
+        if (!compress)
+          feat_writer.Write(vector_reader.Key(), feat);
+        else
+          compressed_feat_writer.Write(vector_reader.Key(), CompressedMatrix(feat));
+      }
+      KALDI_LOG  << "Converted " << num_done << " vectors into features";
+      return (num_done != 0 ? 0 : 1);
+    }
+
+    KALDI_ASSERT(!compress && "Compression not yet supported for single files");
+
+    std::string vector_rxfilename = po.GetArg(1),
+                feature_wxfilename = po.GetArg(2);
+
+    Vector<BaseFloat> vec;
+    ReadKaldiObject(vector_rxfilename, &vec);
+    Matrix<BaseFloat> feat(vec.Dim(), 1);
+    feat.CopyColFromVec(vec, 0);
+    WriteKaldiObject(feat, feature_wxfilename, binary);
+
+    KALDI_LOG << "Converted vector " << PrintableRxfilename(vector_rxfilename)
+              << " to " << PrintableWxfilename(feature_wxfilename);
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+
diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc
index 25acf48a7d1..4e5cbd45282 100644
--- a/src/hmm/posterior.cc
+++ b/src/hmm/posterior.cc
@@ -429,18 +429,6 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model,
   }
 }
 
-// comparator object that can be used to sort from greatest to
-// least posterior.
-struct CompareReverseSecond {
-  // view this as an "<" operator used for sorting, except it behaves like
-  // a ">" operator on the .second field of the pair because we want the
-  // sort to be in reverse order (greatest to least) on posterior.
-  bool operator() (const std::pair<int32, BaseFloat> &a,
-                   const std::pair<int32, BaseFloat> &b) {
-    return (a.second > b.second);
-  }
-};
-
 BaseFloat VectorToPosteriorEntry(
     const VectorBase<BaseFloat> &log_likes,
     int32 num_gselect,
diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h
index 18bbd65a86a..4f5896da7c6 100644
--- a/src/hmm/posterior.h
+++ b/src/hmm/posterior.h
@@ -155,6 +155,18 @@ int32 MergePosteriors(const Posterior &post1,
                       bool drop_frames,
                       Posterior *post);
 
+// Comparator object that can be used to sort (int32, BaseFloat) pairs from
+// greatest to least posterior, i.e. in descending order of the .second field.
+struct CompareReverseSecond {
+  // View this as the "<" predicate used for sorting, except that it applies
+  // ">" to the .second field so the sort runs in reverse order.  It is
+  // const-qualified so it can also be invoked through a const comparator.
+  bool operator() (const std::pair<int32, BaseFloat> &a,
+                   const std::pair<int32, BaseFloat> &b) const {
+    return (a.second > b.second);
+  }
+};
+
 /// Given a vector of log-likelihoods (typically of Gaussians in a GMM
 /// but could be of pdf-ids), a number gselect >= 1 and a minimum posterior
 /// 0 <= min_post < 1, it gets the posterior for each element of log-likes
diff --git a/src/lat/Makefile b/src/lat/Makefile
index ef9166fea12..bb36694f12e 100644
--- a/src/lat/Makefile
+++ b/src/lat/Makefile
@@ -6,7 +6,8 @@ include ../kaldi.mk
 EXTRA_CXXFLAGS += -Wno-sign-compare
 
 TESTFILES = kaldi-lattice-test push-lattice-test minimize-lattice-test \
-      determinize-lattice-pruned-test word-align-lattice-lexicon-test
+      determinize-lattice-pruned-test word-align-lattice-lexicon-test \
+			lattice-functions-test
 
 OBJFILES = kaldi-lattice.o lattice-functions.o word-align-lattice.o \
 	   phone-align-lattice.o word-align-lattice-lexicon.o sausages.o \
diff --git a/src/lat/lattice-functions.cc b/src/lat/lattice-functions.cc
index 0ea66712eda..fcb0039a6a3 100644
--- a/src/lat/lattice-functions.cc
+++ b/src/lat/lattice-functions.cc
@@ -6,6 +6,7 @@
 //                2013  Cisco Systems (author: Neha Agrawal) [code modified
 //                      from original code in ../gmmbin/gmm-rescore-lattice.cc]
 //                2014  Guoguo Chen
+//           2014-2015  Vimal Manohar
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -22,17 +23,22 @@
 // See the Apache 2 License for the specific language governing permissions and
 // limitations under the License.
 
-
+#include <algorithm>
 #include "lat/lattice-functions.h"
 #include "hmm/transition-model.h"
 #include "util/stl-utils.h"
 #include "base/kaldi-math.h"
 #include "hmm/hmm-utils.h"
+#include "hmm/posterior.h"
+#include "base/kaldi-types-extra.h"
 
 namespace kaldi {
 using std::map;
 using std::vector;
 
+typedef SignedLogReal<double> SignedLogDouble;
+typedef SignedLogReal<BaseFloat> SignedLogBaseFloat;
+
 int32 LatticeStateTimes(const Lattice &lat, vector<int32> *times) {
   if (!lat.Properties(fst::kTopSorted, true))
     KALDI_ERR << "Input lattice must be topologically sorted.";
@@ -270,7 +276,9 @@ template bool PruneLattice(BaseFloat beam, CompactLattice *lat);
 
 
 BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post,
-                                 double *acoustic_like_sum) {
+                                 double *acoustic_like_sum,
+                                 std::vector<double> *out_alpha,
+                                 std::vector<double> *out_beta) {
   // Note, Posterior is defined as follows:  Indexed [frame], then a list
   // of (transition-id, posterior-probability) pairs.
   // typedef std::vector<std::vector<std::pair<int32, BaseFloat> > > Posterior;
@@ -289,22 +297,35 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post,
   int32 num_states = lat.NumStates();
   vector<int32> state_times;
   int32 max_time = LatticeStateTimes(lat, &state_times);
-  std::vector<double> alpha(num_states, kLogZeroDouble);
-  std::vector<double> &beta(alpha); // we re-use the same memory for
-  // this, but it's semantically distinct so we name it differently.
+  
+  // Scratch storage for when the caller does not request alpha/beta; as a
+  // local it is freed automatically on every exit path.
+  std::vector<double> scratch;
+  std::vector<double> *alpha, *beta;
+  if (out_alpha && out_beta) {
+    alpha = out_alpha;
+    beta = out_beta;
+    alpha->assign(num_states, kLogZeroDouble);
+    beta->assign(num_states, kLogZeroDouble);
+  } else {
+    scratch.assign(num_states, kLogZeroDouble);
+    alpha = &scratch;
+    beta = alpha;  // shares alpha's memory, but is semantically distinct
+  }
+
   double tot_forward_prob = kLogZeroDouble;
 
   post->clear();
   post->resize(max_time);
 
-  alpha[0] = 0.0;
+  (*alpha)[0] = 0.0;
   // Propagate alphas forward.
   for (StateId s = 0; s < num_states; s++) {
-    double this_alpha = alpha[s];
+    double this_alpha = (*alpha)[s];
     for (ArcIterator<Lattice> aiter(lat, s); !aiter.Done(); aiter.Next()) {
       const Arc &arc = aiter.Value();
       double arc_like = -ConvertToCost(arc.weight);
-      alpha[arc.nextstate] = LogAdd(alpha[arc.nextstate], this_alpha + arc_like);
+      (*alpha)[arc.nextstate] = LogAdd((*alpha)[arc.nextstate], this_alpha + arc_like);
     }
     Weight f = lat.Final(s);
     if (f != Weight::Zero()) {
@@ -320,13 +341,13 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post,
     for (ArcIterator<Lattice> aiter(lat, s); !aiter.Done(); aiter.Next()) {
       const Arc &arc = aiter.Value();
       double arc_like = -ConvertToCost(arc.weight),
-          arc_beta = beta[arc.nextstate] + arc_like;
+          arc_beta = (*beta)[arc.nextstate] + arc_like;
       this_beta = LogAdd(this_beta, arc_beta);
       int32 transition_id = arc.ilabel;
 
       // The following "if" is an optimization to avoid un-needed exp().
       if (transition_id != 0 || acoustic_like_sum != NULL) {
-        double posterior = Exp(alpha[s] + arc_beta - tot_forward_prob);
+        double posterior = Exp((*alpha)[s] + arc_beta - tot_forward_prob);
 
         if (transition_id != 0) // Arc has a transition-id on it [not epsilon]
           (*post)[state_times[s]].push_back(std::make_pair(transition_id,
@@ -337,12 +358,12 @@ BaseFloat LatticeForwardBackward(const Lattice &lat, Posterior *post,
     }
     if (acoustic_like_sum != NULL && f != Weight::Zero()) {
       double final_logprob = - ConvertToCost(f),
-          posterior = Exp(alpha[s] + final_logprob - tot_forward_prob);
+          posterior = Exp((*alpha)[s] + final_logprob - tot_forward_prob);
       *acoustic_like_sum -= posterior * f.Value2();
     }
-    beta[s] = this_beta;
+    (*beta)[s] = this_beta;
   }
-  double tot_backward_prob = beta[0];
+  double tot_backward_prob = (*beta)[0];
   if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) {
     KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob
               << ", while total backward probability = " << tot_backward_prob;
@@ -398,72 +419,6 @@ void ConvertLatticeToPhones(const TransitionModel &trans,
 }
 
 
-static inline double LogAddOrMax(bool viterbi, double a, double b) {
-  if (viterbi)
-    return std::max(a, b);
-  else
-    return LogAdd(a, b);
-}
-
-// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or
-// best-path negated cost) Note: in either case, the alphas and betas are
-// negated costs.  Requires that lat be topologically sorted.  This code
-// will work for either CompactLattice or Latice.
-template<typename LatticeType>
-static double ComputeLatticeAlphasAndBetas(const LatticeType &lat,
-                                           bool viterbi,
-                                           vector<double> *alpha,
-                                           vector<double> *beta) {
-  typedef typename LatticeType::Arc Arc;
-  typedef typename Arc::Weight Weight;
-  typedef typename Arc::StateId StateId;
-
-  StateId num_states = lat.NumStates();
-  KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted);
-  KALDI_ASSERT(lat.Start() == 0);
-  alpha->resize(num_states, kLogZeroDouble);
-  beta->resize(num_states, kLogZeroDouble);
-
-  double tot_forward_prob = kLogZeroDouble;
-  (*alpha)[0] = 0.0;
-  // Propagate alphas forward.
-  for (StateId s = 0; s < num_states; s++) {
-    double this_alpha = (*alpha)[s];
-    for (fst::ArcIterator<LatticeType> aiter(lat, s); !aiter.Done();
-         aiter.Next()) {
-      const Arc &arc = aiter.Value();
-      double arc_like = -ConvertToCost(arc.weight);
-      (*alpha)[arc.nextstate] = LogAddOrMax(viterbi, (*alpha)[arc.nextstate],
-                                                this_alpha + arc_like);
-    }
-    Weight f = lat.Final(s);
-    if (f != Weight::Zero()) {
-      double final_like = this_alpha - ConvertToCost(f);
-      tot_forward_prob = LogAddOrMax(viterbi, tot_forward_prob, final_like);
-    }
-  }
-  for (StateId s = num_states-1; s >= 0; s--) { // it's guaranteed signed.
-    double this_beta = -ConvertToCost(lat.Final(s));
-    for (fst::ArcIterator<LatticeType> aiter(lat, s); !aiter.Done();
-         aiter.Next()) {
-      const Arc &arc = aiter.Value();
-      double arc_like = -ConvertToCost(arc.weight),
-          arc_beta = (*beta)[arc.nextstate] + arc_like;
-      this_beta = LogAddOrMax(viterbi, this_beta, arc_beta);
-    }
-    (*beta)[s] = this_beta;
-  }
-  double tot_backward_prob = (*beta)[lat.Start()];
-  if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) {
-    KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob
-               << ", while total backward probability = " << tot_backward_prob;
-  }
-  // Split the difference when returning... they should be the same.
-  return 0.5 * (tot_backward_prob + tot_forward_prob);
-}
-
-
-
 /// This is used in CompactLatticeLimitDepth.
 struct LatticeArcRecord {
   BaseFloat logprob; // logprob <= 0 is the best Viterbi logprob of this arc,
@@ -736,7 +691,6 @@ bool LatticeBoost(const TransitionModel &trans,
 }
 
 
-
 BaseFloat LatticeForwardBackwardMpeVariants(
     const TransitionModel &trans,
     const std::vector<int32> &silence_phones,
@@ -831,19 +785,24 @@ BaseFloat LatticeForwardBackwardMpeVariants(
         if (!is_mpfe) { // smbr.
           int32 pdf = trans.TransitionIdToPdf(arc.ilabel),
               ref_pdf = trans.TransitionIdToPdf(num_ali[cur_time]);
-          if (!one_silence_class)  // old behavior
+          if (!one_silence_class) {  // old behavior
+            //frame_acc = (pdf == ref_pdf && !ref_phone_is_sil) ? 1.0 : 0.0;
             frame_acc = (pdf == ref_pdf && !phone_is_sil) ? 1.0 : 0.0;
-          else
+          } else
             frame_acc = (pdf == ref_pdf || both_sil) ? 1.0 : 0.0;
         } else {
           if (!one_silence_class)  // old behavior
-            frame_acc = (phone == ref_phone && !phone_is_sil) ? 1.0 : 0.0;
+            frame_acc = (phone == ref_phone && !ref_phone_is_sil) ? 1.0 : 0.0;
           else
             frame_acc = (phone == ref_phone || both_sil) ? 1.0 : 0.0;
         }
       }
       double arc_scale = Exp(alpha[s] + arc_like - alpha[arc.nextstate]);
       alpha_smbr[arc.nextstate] += arc_scale * (alpha_smbr[s] + frame_acc);
+      KALDI_VLOG(10) << "Alpha SMBR for state " << arc.nextstate 
+        << " reached from state " << s
+        << " at time " << state_times[s] << " is "
+        << alpha_smbr[s];
     }
     Weight f = lat.Final(s);
     if (f != Weight::Zero()) {
@@ -875,13 +834,14 @@ BaseFloat LatticeForwardBackwardMpeVariants(
         if (!is_mpfe) { // smbr.
           int32 pdf = trans.TransitionIdToPdf(arc.ilabel),
               ref_pdf = trans.TransitionIdToPdf(num_ali[cur_time]);
-          if (!one_silence_class)  // old behavior
+          if (!one_silence_class) { // old behavior
+            //frame_acc = (pdf == ref_pdf && !ref_phone_is_sil) ? 1.0 : 0.0;
             frame_acc = (pdf == ref_pdf && !phone_is_sil) ? 1.0 : 0.0;
-          else
+          } else
             frame_acc = (pdf == ref_pdf || both_sil) ? 1.0 : 0.0;
         } else {
           if (!one_silence_class)  // old behavior
-            frame_acc = (phone == ref_phone && !phone_is_sil) ? 1.0 : 0.0;
+            frame_acc = (phone == ref_phone && !ref_phone_is_sil) ? 1.0 : 0.0;
           else
             frame_acc = (phone == ref_phone || both_sil) ? 1.0 : 0.0;
         }
@@ -892,8 +852,13 @@ BaseFloat LatticeForwardBackwardMpeVariants(
       // i.e., paths don't survive to the final state
       if (KALDI_ISNAN(arc_scale)) arc_scale = 0;
       beta_smbr[s] += arc_scale * (beta_smbr[arc.nextstate] + frame_acc);
+      KALDI_VLOG(10) << "Beta SMBR for state " << s 
+                     << " going to state " << arc.nextstate
+                     << " at time " << state_times[s] << " is "
+                     << beta_smbr[s];
 
       if (transition_id != 0) { // Arc has a transition-id on it [not epsilon]
+        // Get gradient wrt acoustic log-likelihood
         double posterior = Exp(alpha[s] + arc_beta - tot_forward_prob);
         double acc_diff = alpha_smbr[s] + frame_acc + beta_smbr[arc.nextstate]
                                - tot_forward_score;
@@ -918,6 +883,491 @@ BaseFloat LatticeForwardBackwardMpeVariants(
   return tot_forward_score;
 }
 
+
+
+// Computes "empfe"/"esmbr" posteriors over 'lat' into *post; returns the total
+// forward score.  Reference posteriors come from num_lat, else num_post, else
+// 'lat' itself; frames whose best one is < weight_threshold are emptied.
+BaseFloat LatticeForwardBackwardEmpeVariants(
+    const TransitionModel &trans,
+    const std::vector<int32> &silence_phones,
+    const Lattice &lat,
+    const std::vector<int32> &num_ali,
+    const Posterior *num_post,
+    const Lattice *num_lat,
+    std::string criterion,
+    bool one_silence_class,
+    BaseFloat deletion_penalty,
+    Posterior *post,
+    BaseFloat weight_threshold,
+    const std::vector<BaseFloat> *weights) {
+  using namespace fst;
+
+  KALDI_ASSERT(criterion == "empfe" || criterion == "esmbr");
+  
+  if (lat.Properties(fst::kTopSorted, true) == 0)
+    KALDI_ERR << "Input lattice must be topologically sorted.";
+  KALDI_ASSERT(lat.Start() == 0);
+  
+  vector<int32> state_times;
+  int32 max_time = LatticeStateTimes(lat, &state_times);
+ 
+  KALDI_ASSERT(num_ali.size() == max_time);
+  std::vector<double> alpha, beta;
+  Posterior num_post_computed;
+  // NOTE(review): the assert above only admits "empfe"/"esmbr", so the first
+  // branch below (numerator alignment) can never be taken as written.
+  if ((criterion == "smbr" || criterion == "mpfe") && num_lat == NULL && num_post == NULL) {
+    // Using numerator alignment
+    KALDI_VLOG(4) << "Computing for " << criterion 
+                  << " criterion using numerator alignment";
+    AlignmentToPosterior(num_ali, &num_post_computed);
+    ComputeLatticeAlphasAndBetas(lat, false, &alpha, &beta);
+  } else if (num_lat) {
+    // Using numerator lattice
+    KALDI_VLOG(4) << "Computing for " << criterion 
+                  << " criterion using numerator lattice";
+    LatticeForwardBackward(*num_lat, &num_post_computed, NULL);
+    ComputeLatticeAlphasAndBetas(lat, false, &alpha, &beta);
+  } else if (num_post) {
+    // Using numerator posteriors
+    KALDI_VLOG(4) << "Computing for " << criterion 
+                  << " criterion using numerator posteriors";
+    num_post_computed = *num_post;
+    ComputeLatticeAlphasAndBetas(lat, false, &alpha, &beta);
+  } else {
+    // Using denominator lattice
+    KALDI_VLOG(4) << "Computing for " << criterion 
+                  << " criterion using denominator lattice";
+    LatticeForwardBackward(lat, &num_post_computed, 
+                           NULL, &alpha, &beta);
+  }
+  // Every frame of 'lat' needs an entry; then merge equal transition-ids.
+  KALDI_ASSERT(num_post_computed.size() >= static_cast<size_t>(max_time));
+  for (int32 t = 0; t < max_time; t++)
+    MergePairVectorSumming(&(num_post_computed[t]));
+
+  // Remove frames with max numerator posterior < weight_threshold
+  for (int32 i = 0; i < max_time; i++) {
+    std::vector<std::pair<int32, BaseFloat> > &post_i = num_post_computed[i];
+    std::vector<std::pair<int32, BaseFloat> >::iterator it = 
+      std::min_element(post_i.begin(), post_i.end(), CompareReverseSecond());
+    if (it != post_i.end() && it->second < weight_threshold)  // skip empty frames
+      post_i.clear();
+  }
+
+  BaseFloat tot_forward_score = 
+    LatticeForwardBackwardEmpeVariantsInternal(trans, silence_phones, lat,
+                        num_ali, num_post_computed, alpha, beta, criterion, 
+                        one_silence_class, deletion_penalty, post, weights);
+
+  return tot_forward_score;
+}
+
+// Second forward-backward pass for the "empfe"/"esmbr" criteria.  Uses the
+// first-pass log-domain 'alpha'/'beta' of 'lat' and the per-frame reference
+// posteriors 'num_post' to accumulate path-weighted frame accuracies, writes
+// (transition-id, arc-posterior * accuracy-difference) pairs per frame to
+// *post, and returns the total forward score.
+BaseFloat LatticeForwardBackwardEmpeVariantsInternal(
+    const TransitionModel &trans,
+    const std::vector<int32> &silence_phones,
+    const Lattice &lat,
+    const std::vector<int32> &num_ali,
+    const Posterior &num_post,
+    const std::vector<double> &alpha,
+    const std::vector<double> &beta,
+    std::string criterion,
+    bool one_silence_class,
+    BaseFloat deletion_penalty,
+    Posterior *post,
+    const std::vector<BaseFloat> *weights = NULL) {
+  using namespace fst;
+  typedef Lattice::Arc Arc;
+  typedef Arc::Weight Weight;
+  typedef Arc::StateId StateId;
+
+  KALDI_ASSERT(criterion == "empfe" || criterion == "esmbr");
+  bool is_mpfe = (criterion == "empfe");
+
+  if (lat.Properties(fst::kTopSorted, true) == 0)
+    KALDI_ERR << "Input lattice must be topologically sorted.";
+  KALDI_ASSERT(lat.Start() == 0);
+  
+  int32 num_states = lat.NumStates();
+  vector<int32> state_times;
+  int32 max_time = LatticeStateTimes(lat, &state_times);
+
+  KALDI_ASSERT(alpha.size() == num_states && beta.size() == num_states);
+
+  std::vector<double> alpha_smbr(num_states, 0), //forward variable for sMBR
+                      beta_smbr(num_states, 0); //backward variable for sMBR
+  
+  post->clear();
+  post->resize(max_time);
+  
+  double tot_forward_prob = beta[0];
+  double tot_forward_score = 0;
+
+  alpha_smbr[0] = 0.0;
+  // Second Pass Forward, calculate forward for EMPFE/ESMBR
+  for (StateId s = 0; s < num_states; s++) {
+    double this_alpha = alpha[s];
+    for (ArcIterator<Lattice> aiter(lat, s); !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      double arc_like = -ConvertToCost(arc.weight);
+      double frame_acc = 0.0;
+      if (arc.ilabel != 0) {
+        int32 cur_time = state_times[s];
+        int32 phone = trans.TransitionIdToPhone(arc.ilabel);
+        int32 pdf = trans.TransitionIdToPdf(arc.ilabel);
+        bool phone_is_sil = std::binary_search(silence_phones.begin(),
+                                               silence_phones.end(),
+                                               phone);
+
+        // Go through the numerator lattice
+        for (std::vector<std::pair<int32,BaseFloat> >::const_iterator it = num_post[cur_time].begin(); 
+            it != num_post[cur_time].end(); ++it) {
+          int32 ref_phone = trans.TransitionIdToPhone(it->first);
+          BaseFloat weight = it->second;
+          
+          bool ref_phone_is_sil = std::binary_search(silence_phones.begin(),
+                                                     silence_phones.end(),
+                                                     ref_phone),
+               both_sil = phone_is_sil && ref_phone_is_sil;
+
+          if (!is_mpfe) { // smbr.
+            int32 ref_pdf = trans.TransitionIdToPdf(it->first);
+            if (!one_silence_class)  // old behavior
+              // (fixed: silence is judged on the reference phone, not the arc's)
+              frame_acc += (pdf == ref_pdf && !ref_phone_is_sil) ? weight : 0.0;
+            else
+              frame_acc += (pdf == ref_pdf || both_sil) ? weight : 0.0;
+          } else {
+            if (!one_silence_class)  // old behavior
+              // (fixed: silence is judged on the reference phone, not the arc's)
+              frame_acc += (phone == ref_phone && !ref_phone_is_sil) ? weight : 0.0;
+            else
+              frame_acc += (phone == ref_phone || both_sil) ? weight : 0.0;
+          }
+        }
+
+        if (deletion_penalty > 0) {
+          int32 ali_phone = trans.TransitionIdToPhone(num_ali[cur_time]);
+          bool ali_is_sil = std::binary_search(silence_phones.begin(),
+              silence_phones.end(),
+              ali_phone);
+          // Add extra score to a path if it is not a deletion
+          // (deletion: path has silence and best path has non-silence)
+          frame_acc += !(!ali_is_sil && phone_is_sil) ? deletion_penalty : 0.0;
+        }
+      }
+      // Epsilon arcs keep frame_acc == 0, so they never need to index 'weights'.
+      if (weights != NULL && arc.ilabel != 0)
+        frame_acc *= (*weights)[state_times[s]]; 
+
+      double arc_scale = Exp(alpha[s] + arc_like - alpha[arc.nextstate]);
+      alpha_smbr[arc.nextstate] += arc_scale * (alpha_smbr[s] + frame_acc);
+      KALDI_VLOG(10) << "Alpha SMBR for state " << arc.nextstate 
+        << " reached from state " << s
+        << " at time " << state_times[s] << " is "
+        << alpha_smbr[arc.nextstate];
+    }
+    Weight f = lat.Final(s);
+    if (f != Weight::Zero()) {
+      double final_like = this_alpha - (f.Value1() + f.Value2());
+      double arc_scale = Exp(final_like - tot_forward_prob);
+      tot_forward_score += arc_scale * alpha_smbr[s];
+      KALDI_ASSERT(state_times[s] == max_time &&
+                   "Lattice is inconsistent (final-prob not at max_time)");
+    }
+  }
+  
+  // Second Pass Backward, collect EMPFE style posteriors
+  for (StateId s = num_states-1; s >= 0; s--) {
+    for (ArcIterator<Lattice> aiter(lat, s); !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      double arc_like = -ConvertToCost(arc.weight),
+             arc_beta = beta[arc.nextstate] + arc_like;
+      int32 transition_id = arc.ilabel;
+      double frame_acc = 0.0;
+      if (arc.ilabel != 0) {
+        int32 cur_time = state_times[s];
+        int32 phone = trans.TransitionIdToPhone(arc.ilabel);
+        int32 pdf = trans.TransitionIdToPdf(arc.ilabel);
+        bool phone_is_sil = std::binary_search(silence_phones.begin(),
+                                               silence_phones.end(), phone);
+        for (std::vector<std::pair<int32, BaseFloat> >::const_iterator it = num_post[cur_time].begin();
+            it != num_post[cur_time].end(); ++it) {
+          int32 ref_phone = trans.TransitionIdToPhone(it->first);
+          BaseFloat weight = it->second;
+          bool ref_phone_is_sil = std::binary_search(silence_phones.begin(),
+                                                     silence_phones.end(),
+                                                     ref_phone),
+               both_sil = phone_is_sil && ref_phone_is_sil;
+          if (!is_mpfe) { // smbr.
+            int32 ref_pdf = trans.TransitionIdToPdf(it->first);
+            if (!one_silence_class)  // old behavior
+              // (fixed: silence is judged on the reference phone, not the arc's)
+              frame_acc += (pdf == ref_pdf && !ref_phone_is_sil) ? weight : 0.0;
+            else
+              frame_acc += (pdf == ref_pdf || both_sil) ? weight : 0.0;
+          } else {
+            if (!one_silence_class)  // old behavior
+              // (fixed: silence is judged on the reference phone, not the arc's)
+              frame_acc += (phone == ref_phone && !ref_phone_is_sil) ? weight : 0.0;
+            else
+              frame_acc += (phone == ref_phone || both_sil) ? weight : 0.0;
+          }
+        }
+
+        if (deletion_penalty > 0.0) {
+          int32 ali_phone = trans.TransitionIdToPhone(num_ali[cur_time]);
+          bool ali_is_sil = std::binary_search(silence_phones.begin(),
+              silence_phones.end(),
+              ali_phone);
+          // Add extra score to a path if it is not a deletion
+          // (deletion: path has silence and best path has non-silence)
+          frame_acc += !(!ali_is_sil && phone_is_sil) ? deletion_penalty : 0.0;
+        }
+      }
+      // Epsilon arcs keep frame_acc == 0, so they never need to index 'weights'.
+      if (weights != NULL && arc.ilabel != 0)
+        frame_acc *= (*weights)[state_times[s]]; 
+
+      double arc_scale = Exp(beta[arc.nextstate] + arc_like - beta[s]);
+      // check arc_scale NAN,
+      // this is to prevent partial paths in Lattices
+      // i.e., paths don't survive to the final state
+      if (KALDI_ISNAN(arc_scale)) arc_scale = 0;
+      beta_smbr[s] += arc_scale * (beta_smbr[arc.nextstate] + frame_acc);
+      KALDI_VLOG(10) << "Beta SMBR for state " << s 
+                     << " going to state " << arc.nextstate
+                     << " at time " << state_times[s] << " is "
+                     << beta_smbr[s];
+
+      if (transition_id != 0) { // Arc has a transition-id on it [not epsilon]
+        double posterior = Exp(alpha[s] + arc_beta - tot_forward_prob);
+        double acc_diff = alpha_smbr[s] + frame_acc + beta_smbr[arc.nextstate]
+                               - tot_forward_score;
+        double posterior_smbr = posterior * acc_diff;
+        (*post)[state_times[s]].push_back(std::make_pair(transition_id,
+                                                         static_cast<BaseFloat>(posterior_smbr)));
+      }
+    }
+  }
+  
+  //Second Pass Forward Backward check
+  double tot_backward_score = beta_smbr[0];  // Initial state id == 0
+  // The tolerance may need to be loosened somewhat here (1e-5 / 1e-4).
+  if (!ApproxEqual(tot_forward_score, tot_backward_score, 1e-4)) {
+    KALDI_ERR << "Total forward score over lattice = " << tot_forward_score
+              << ", while total backward score = " << tot_backward_score;
+  }
+
+  // Output the computed posteriors
+  for (int32 t = 0; t < max_time; t++)
+    MergePairVectorSumming(&((*post)[t]));
+
+  return tot_forward_score;
+}
+
+
+// Forward-backward over 'lat' for the NCE (negative conditional entropy)
+// objective.  Writes to *post, per frame, pairs (transition-id, -delH) where
+// delH is the arc's entropy-derivative term, and returns -H.  States whose
+// frame weight is below 'weight_threshold' contribute no derivatives.
+SignedLogDouble LatticeForwardBackwardNce(
+    const TransitionModel &trans,
+    const Lattice &lat,
+    Posterior *post,
+    const std::vector<BaseFloat> *weights,
+    BaseFloat weight_threshold) {
+  using namespace fst;
+  typedef Lattice::Arc Arc;
+  typedef Arc::Weight Weight;
+  typedef Arc::StateId StateId;
+
+  if (lat.Properties(fst::kTopSorted, true) == 0)
+    KALDI_ERR << "Input lattice must be topologically sorted.";
+  KALDI_ASSERT(lat.Start() == 0);
+  
+  int32 num_states = lat.NumStates();
+  vector<int32> state_times;
+  int32 max_time = LatticeStateTimes(lat, &state_times);
+
+  std::vector<SignedLogDouble> alpha_p(num_states),  // forward variable for p
+      alpha_r(num_states),                  // forward variable for -plog(p)
+      beta_p(num_states),                   // backward variable for p
+      beta_r(num_states);                   // backward variable for -plog(p)
+
+  SignedLogDouble Z;
+  SignedLogDouble r;
+
+  post->clear();
+  post->resize(max_time);
+
+  alpha_p[0].SetOne();
+  int32 final_states_count = 0;
+  // Forward Pass
+  for (StateId s = 0; s < num_states; s++) {
+    SignedLogDouble this_alpha_p(alpha_p[s]);
+    SignedLogDouble this_alpha_r(alpha_r[s]);
+    
+    for (ArcIterator<Lattice> aiter(lat, s); !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      SignedLogDouble p_a(false, -ConvertToCost(arc.weight));   // Initialize from log of real number
+      
+      // r_a = (p_a * -log_p_a);
+      SignedLogDouble r_a(-p_a.LogMagnitude());
+      r_a.Multiply(p_a);
+
+      // alpha_p[n[a]] += this_alpha_p * p_a
+      alpha_p[arc.nextstate].Add(this_alpha_p * p_a);
+      
+      // alpha_r[n[a]] += this_alpha_p * r_a + this_alpha_r * p_a
+      alpha_r[arc.nextstate].Add(this_alpha_p * r_a);
+      alpha_r[arc.nextstate].Add(this_alpha_r * p_a);
+    }
+    Weight f = lat.Final(s);
+    if (f != Weight::Zero()) {
+      final_states_count++;
+      SignedLogDouble f_p(false, -(f.Value1() + f.Value2()));
+      
+      // f_r = f_p * -log_f_p
+      SignedLogDouble f_r(-f_p.LogMagnitude());   // Initialize from a real number
+      f_r.Multiply(f_p);
+
+      Z.Add(this_alpha_p * f_p);
+      
+      r.Add(this_alpha_p * f_r);
+      r.Add(this_alpha_r * f_p);
+      
+      KALDI_ASSERT(state_times[s] == max_time && "Lattice is inconsistent (final-prob not at max_time");
+    }
+  }
+
+  // Special case check where the final state has weight One(). 
+  // This case is ensured by connecting all original final states to the "Final"
+  // state through arcs carrying their respective final weights and then
+  // add a "Final" weight of One() to the new state
+  // KALDI_ASSERT(final_states_count == 1); // Apparently not true
+
+  // Backward Pass
+  for (StateId s = num_states-1; s >= 0; s--) {
+    Weight f = lat.Final(s);
+    SignedLogDouble this_beta_p;
+    SignedLogDouble this_beta_r;
+
+    if (f != Weight::Zero()) {
+      KALDI_ASSERT(state_times[s] == max_time); // Special case
+
+      SignedLogDouble f_p(false, -(f.Value1() + f.Value2()));   // Initialize from log of real number
+      
+      // f_r = f_p * -log_f_p
+      SignedLogDouble f_r(-f_p.LogMagnitude());   // Initialize from real number
+      f_r.Multiply(f_p);
+
+      this_beta_p.Add(f_p);
+      this_beta_r.Add(f_r);
+    }
+
+    for (ArcIterator<Lattice> aiter(lat,s); !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      SignedLogDouble p_a(false, -ConvertToCost(arc.weight));   // Initialize from log of real number
+      
+      // log(p_a * -log_p_a);
+      SignedLogDouble r_a(-p_a.LogMagnitude());
+      r_a.Multiply(p_a);
+      
+      this_beta_p.Add(beta_p[arc.nextstate] * p_a);
+      this_beta_r.Add(beta_p[arc.nextstate] * r_a);
+      this_beta_r.Add(beta_r[arc.nextstate] * p_a);
+    }
+    beta_p[s] = this_beta_p;
+    beta_r[s] = this_beta_r;
+    
+    KALDI_VLOG(10) << "beta_p for state " << s << " is " << beta_p[s];
+    KALDI_VLOG(10) << "beta_r for state " << s << " is " << beta_r[s];
+  }
+
+  // Forward-Backward Check
+  KALDI_VLOG(10) << "Total forward probability over lattice = " << Z
+                  << ", while total backward probability = " << beta_p[0];
+  KALDI_VLOG(10) << "Total forward (-plog(p)) over lattice = " << r
+            << ", while total backward (-plog(p)) = " << beta_r[0];
+  if (!Z.ApproxEqual(beta_p[0], 1e-6)) {
+    KALDI_WARN << "Total forward probability over lattice = " << Z
+              << ", while total backward probability = " << beta_p[0];
+  }
+  if (!r.ApproxEqual(beta_r[0], 1e-6)) {
+    KALDI_WARN << "Total forward (-plog(p)) over lattice = " << r
+              << ", while total backward (-plog(p)) = " << beta_r[0];
+  }
+      
+  // Compute Entropy H = r/Z + log(Z)
+  SignedLogDouble H(r);
+  H.DivideBy(Z);
+
+  KALDI_ASSERT(Z.Positive());
+  H.AddReal(Z.LogMagnitude());
+
+  KALDI_VLOG(4) << "Entropy of Lattice is " << H;
+
+  // Derivative Computation
+  for (StateId s = 0; s < num_states; s++) {
+    int32 t = state_times[s];
+    if (weights != NULL && t < static_cast<int32>(weights->size()) &&
+        (*weights)[t] < weight_threshold)  // bound check: t may equal max_time
+      continue;
+    for (ArcIterator<Lattice> aiter(lat, s); !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      SignedLogDouble p_a(false, -ConvertToCost(arc.weight));
+      
+      // log(p_a * -log_p_a);
+      SignedLogDouble r_a(-p_a.LogMagnitude());
+      r_a.Multiply(p_a);
+      if (arc.ilabel != 0) {
+        //SignedLogDouble delH((alpha_p[s] * beta_p[arc.nextstate] * p_a) / Z);
+        //delH.Sub((alpha_p[s] * beta_p[arc.nextstate] * p_a) / Z * r / Z); 
+        //delH.Add((alpha_p[s] * beta_r[arc.nextstate] * p_a) / Z);
+        //delH.Add((alpha_r[s] * beta_p[arc.nextstate] * p_a) / Z);
+        //delH.Sub((alpha_p[s] * beta_p[arc.nextstate] * p_a) / Z);
+        //delH.Add((alpha_p[s] * beta_p[arc.nextstate] * r_a) / Z);
+        
+        SignedLogDouble delZ = alpha_p[s] * beta_p[arc.nextstate] * p_a;
+        SignedLogDouble delr = alpha_p[s] * beta_r[arc.nextstate] * p_a;
+        delr.Add(alpha_r[s] * beta_p[arc.nextstate] * p_a);
+        delr.Add(alpha_p[s] * beta_p[arc.nextstate] * r_a);
+        delr.Sub(alpha_p[s] * beta_p[arc.nextstate] * p_a);
+
+        SignedLogDouble delH = delZ / Z;
+        delH.Sub(delZ / Z * r / Z);
+        delH.Add(delr / Z);
+
+        // Push back delNce = -delH
+        (*post)[state_times[s]].push_back(std::make_pair(arc.ilabel, -delH.Value())); 
+        /*
+           (1/Z - r/Z/Z) * alpha_p[s] * beta_p[arc.nextstate] * p_a
+           + 1/Z * (
+           alpha_p[s] * beta_r[arc.nextstate] * p_a 
+           + alpha_r[s] * beta_p[arc.nextstate] * p_a
+           - alpha_p[s] * beta_p[arc.nextstate] * p_a
+           + alpha_p[s] * beta_p[arc.nextstate] * r_a
+           )
+
+           -(1.0/Z - r/Z/Z) * Exp(alpha_p[s] + beta_p[arc.nextstate] + log_p_a) 
+           - (1.0/Z) * ( Exp(alpha_p[s] + beta_r[arc.nextstate] + log_p_a)
+           + Exp(alpha_r[s] + beta_p[arc.nextstate] + log_p_a)
+           - Exp(alpha_p[s] + beta_p[arc.nextstate] + log_p_a)
+           + Exp(alpha_p[s] + beta_p[arc.nextstate] + log_r_a) )));
+           */
+      }
+    }
+  }
+  return -H;    // Negative Conditional Entropy
+}
+
 bool CompactLatticeToWordAlignment(const CompactLattice &clat,
                                    std::vector<int32> *words,
                                    std::vector<int32> *begin_times,
@@ -969,6 +1419,7 @@ bool CompactLatticeToWordAlignment(const CompactLattice &clat,
 }
 
 
+
 bool CompactLatticeToWordProns(
     const TransitionModel &tmodel,
     const CompactLattice &clat,
diff --git a/src/lat/lattice-functions.h b/src/lat/lattice-functions.h
index 505aaffbe55..8aee928c6a7 100644
--- a/src/lat/lattice-functions.h
+++ b/src/lat/lattice-functions.h
@@ -4,6 +4,7 @@
 //           2012-2013   Johns Hopkins University (Author: Daniel Povey);
 //                       Bagher BabaAli
 //                2014   Guoguo Chen
+//                2014   Vimal Manohar
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -33,9 +34,13 @@
 #include "hmm/transition-model.h"
 #include "lat/kaldi-lattice.h"
 #include "itf/decodable-itf.h"
+#include "base/kaldi-types-extra.h"
 
 namespace kaldi {
 
+typedef SignedLogReal<double> SignedLogDouble;
+typedef SignedLogReal<BaseFloat> SignedLogBaseFloat;
+
 /// This function iterates over the states of a topologically sorted lattice and
 /// counts the time instance corresponding to each state. The times are returned
 /// in a vector of integers 'times' which is resized to have a size equal to the
@@ -61,7 +66,9 @@ int32 CompactLatticeStateTimes(const CompactLattice &clat,
 /// the objective function in MMI discriminative training.
 BaseFloat LatticeForwardBackward(const Lattice &lat,
                                  Posterior *arc_post,
-                                 double *acoustic_like_sum = NULL);
+                                 double *acoustic_like_sum = NULL,
+                                 std::vector<double> *out_alpha = NULL,
+                                 std::vector<double> *out_beta = NULL);
 
 // This function is something similar to LatticeForwardBackward(), but it is on
 // the CompactLattice lattice format. Also we only need the alpha in the forward 
@@ -74,6 +81,72 @@ bool ComputeCompactLatticeAlphas(const CompactLattice &lat,
 bool ComputeCompactLatticeBetas(const CompactLattice &lat,
                                 vector<double> *beta);
 
+
+// Combines two log-domain values: the larger of the two when 'viterbi' is
+// true, and LogAdd(a, b) otherwise, so that one forward-backward routine can
+// serve both the best-path and the sum-over-paths computations.
+static inline double LogAddOrMax(bool viterbi, double a, double b) {
+  return viterbi ? std::max(a, b) : LogAdd(a, b);
+}
+
+
+// Computes (normal or Viterbi) alphas and betas; returns (total-prob, or
+// best-path negated cost).  Note: in either case, the alphas and betas are
+// negated costs.  Requires that lat be topologically sorted with start state
+// 0.  This code will work for either CompactLattice or Lattice.
+template<typename LatticeType>
+double ComputeLatticeAlphasAndBetas(const LatticeType &lat,
+                                    bool viterbi,
+                                    vector<double> *alpha,
+                                    vector<double> *beta) {
+  typedef typename LatticeType::Arc Arc;
+  typedef typename Arc::Weight Weight;
+  typedef typename Arc::StateId StateId;
+
+  StateId num_states = lat.NumStates();
+  KALDI_ASSERT(lat.Properties(fst::kTopSorted, true) == fst::kTopSorted);
+  KALDI_ASSERT(lat.Start() == 0);
+  // assign(), not resize(): stale values in reused vectors must be overwritten.
+  alpha->assign(num_states, kLogZeroDouble);
+  beta->assign(num_states, kLogZeroDouble);
+  double tot_forward_prob = kLogZeroDouble;
+  (*alpha)[0] = 0.0;
+  // Propagate alphas forward.
+  for (StateId s = 0; s < num_states; s++) {
+    double this_alpha = (*alpha)[s];
+    for (fst::ArcIterator<LatticeType> aiter(lat, s); !aiter.Done();
+         aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      double arc_like = -ConvertToCost(arc.weight);
+      (*alpha)[arc.nextstate] = LogAddOrMax(viterbi, (*alpha)[arc.nextstate],
+                                                this_alpha + arc_like);
+    }
+    Weight f = lat.Final(s);
+    if (f != Weight::Zero()) {
+      double final_like = this_alpha - ConvertToCost(f);
+      tot_forward_prob = LogAddOrMax(viterbi, tot_forward_prob, final_like);
+    }
+  }
+  for (StateId s = num_states-1; s >= 0; s--) { // it's guaranteed signed.
+    double this_beta = -ConvertToCost(lat.Final(s));
+    for (fst::ArcIterator<LatticeType> aiter(lat, s); !aiter.Done();
+         aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      double arc_like = -ConvertToCost(arc.weight),
+          arc_beta = (*beta)[arc.nextstate] + arc_like;
+      this_beta = LogAddOrMax(viterbi, this_beta, arc_beta);
+    }
+    (*beta)[s] = this_beta;
+  }
+  double tot_backward_prob = (*beta)[lat.Start()];
+  if (!ApproxEqual(tot_forward_prob, tot_backward_prob, 1e-8)) {
+    KALDI_WARN << "Total forward probability over lattice = " << tot_forward_prob
+               << ", while total backward probability = " << tot_backward_prob;
+  }
+  // Split the difference when returning... they should be the same.
+  return 0.5 * (tot_backward_prob + tot_forward_prob);
+}
+
 /// Topologically sort the compact lattice if not already topologically sorted.
 /// Will crash if the lattice cannot be topologically sorted.
 void TopSortCompactLatticeIfNeeded(CompactLattice *clat);
@@ -176,6 +249,34 @@ BaseFloat LatticeForwardBackwardMpeVariants(
     bool one_silence_class,
     Posterior *post);
 
+BaseFloat LatticeForwardBackwardEmpeVariants(
+    const TransitionModel &trans,
+    const std::vector<int32> &silence_phones,
+    const Lattice &lat,
+    const std::vector<int32> &num_ali,
+    const Posterior *num_post,
+    const Lattice *num_lat,
+    std::string criterion,
+    bool one_silence_class,
+    BaseFloat deletion_penalty,
+    Posterior *post,
+    BaseFloat weight_threshold = 0.0,
+    const std::vector<BaseFloat> *weights = NULL);
+
+BaseFloat LatticeForwardBackwardEmpeVariantsInternal(
+    const TransitionModel &trans,
+    const std::vector<int32> &silence_phones,
+    const Lattice &lat,
+    const std::vector<int32> &num_ali,
+    const Posterior &num_post,
+    const std::vector<double> &alpha,
+    const std::vector<double> &beta,
+    std::string criterion,
+    bool one_silence_class,
+    BaseFloat deletion_penalty,
+    Posterior *post, 
+    const std::vector<BaseFloat> *weights);
+
 /**
    This function can be used to compute posteriors for MMI, with a positive contribution
    for the numerator and a negative one for the denominator.  This function is not actually
@@ -198,6 +299,19 @@ BaseFloat LatticeForwardBackwardMmi(
     bool cancel,
     Posterior *arc_post);
 
+/**
+   This function can be used to compute the derivatives of NCE objective
+   function. This function is written for using in neural-net
+   semi-supervised discriminative training. 
+   It returns the objective function, which is the negative conditional
+   entropy of the lattice given the observation sequence. */
+SignedLogDouble LatticeForwardBackwardNce(
+    const TransitionModel &trans,
+    const Lattice &lat,
+    Posterior *arc_post,
+    const std::vector<BaseFloat> *weights = NULL,
+    BaseFloat weight_threshold = 0.0);
+
 
 /// This function takes a CompactLattice that should only contain a single
 /// linear sequence (e.g. derived from lattice-1best), and that should have been
diff --git a/src/latbin/Makefile b/src/latbin/Makefile
index f1633978fbf..ef2b2064b24 100644
--- a/src/latbin/Makefile
+++ b/src/latbin/Makefile
@@ -20,7 +20,8 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \
            lattice-minimize lattice-limit-depth lattice-depth-per-frame \
            lattice-confidence lattice-determinize-phone-pruned \
            lattice-determinize-phone-pruned-parallel lattice-expand-ngram \
-           lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons
+           lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \
+					 lattice-determinize-non-compact
 
 OBJFILES =
 
diff --git a/src/latbin/lattice-copy.cc b/src/latbin/lattice-copy.cc
index 76ca034b2e4..a97a75e7450 100644
--- a/src/latbin/lattice-copy.cc
+++ b/src/latbin/lattice-copy.cc
@@ -24,6 +24,108 @@
 #include "fstext/fstext-lib.h"
 #include "lat/kaldi-lattice.h"
 
+namespace kaldi {
+  // Copies lattices from lattice_reader to lattice_writer, keeping (include)
+  // or dropping (!include) keys found in the first field of each line of
+  // 'filename'.  With include, stops at the first key greater than all listed
+  // keys.  Returns 0 if anything was written or ignore_missing is set, else 1.
+  int32 CopySubsetLattices(std::string filename, 
+      SequentialLatticeReader *lattice_reader,
+      LatticeWriter *lattice_writer,
+      bool include = true, bool ignore_missing = false
+      ) {
+    unordered_set<std::string, StringHasher> subset;
+    std::set<std::string> subset_list; 
+    bool binary;
+    Input ki(filename, &binary);
+    KALDI_ASSERT(!binary);
+    std::string line;
+    while (std::getline(ki.Stream(), line)) {
+      std::vector<std::string> split_line;
+      SplitStringToVector(line, " \t\r", true, &split_line);
+      if(split_line.empty()) {
+        KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename;
+      }
+      subset.insert(split_line[0]);
+      subset_list.insert(split_line[0]);
+    }
+
+    int32 num_total = 0;
+    size_t num_success = 0;
+    for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) {
+      if (include && !subset_list.empty() &&  // rbegin() is invalid when empty
+          lattice_reader->Key() > *(subset_list.rbegin())) {
+        KALDI_LOG << "The utterance " << lattice_reader->Key()
+                  << " is larger than "
+                  << "the last key in the include list. Not reading further.";
+        KALDI_LOG << "Wrote " << num_success << " utterances";
+        return (ignore_missing || num_success != 0) ? 0 : 1;
+      }
+      if (include && subset.count(lattice_reader->Key()) > 0) {
+        lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
+        num_success++;
+      } else if (!include && subset.count(lattice_reader->Key()) == 0) {
+        lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
+        num_success++;
+      }
+    }
+
+    KALDI_LOG << "Wrote " << num_success << " out of " << num_total
+      << " utterances.";
+    return (ignore_missing || num_success != 0) ? 0 : 1;
+  }
+
+  // Compact-lattice overload: copies from lattice_reader to lattice_writer,
+  // keeping (include) or dropping (!include) keys found in the first field of
+  // each line of 'filename'.  With include, stops at the first key greater than
+  // all listed keys.  Returns 0 if anything was written or ignore_missing, else 1.
+  int32 CopySubsetLattices(std::string filename, 
+      SequentialCompactLatticeReader *lattice_reader,
+      CompactLatticeWriter *lattice_writer,
+      bool include = true, bool ignore_missing = false
+      ) {
+    unordered_set<std::string, StringHasher> subset;
+    std::set<std::string> subset_list; 
+    bool binary;
+    Input ki(filename, &binary);
+    KALDI_ASSERT(!binary);
+    std::string line;
+    while (std::getline(ki.Stream(), line)) {
+      std::vector<std::string> split_line;
+      SplitStringToVector(line, " \t\r", true, &split_line);
+      if(split_line.empty()) {
+        KALDI_ERR << "Unable to parse line \"" << line << "\" encountered in input in " << filename;
+      }
+      subset.insert(split_line[0]);
+      subset_list.insert(split_line[0]);
+    }
+
+    int32 num_total = 0;
+    size_t num_success = 0;
+    for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) {
+      if (include && !subset_list.empty() &&  // rbegin() is invalid when empty
+          lattice_reader->Key() > *(subset_list.rbegin())) {
+        KALDI_LOG << "The utterance " << lattice_reader->Key()
+                  << " is larger than "
+                  << "the last key in the include list. Not reading further.";
+        KALDI_LOG << "Wrote " << num_success << " utterances";
+        return (ignore_missing || num_success != 0) ? 0 : 1;
+      }
+      if (include && subset.count(lattice_reader->Key()) > 0) {
+        lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
+        num_success++;
+      } else if (!include && subset.count(lattice_reader->Key()) == 0) {
+        lattice_writer->Write(lattice_reader->Key(), lattice_reader->Value());
+        num_success++;
+      }
+    }
+
+    KALDI_LOG << " Wrote " << num_success << " out of " << num_total
+      << " utterances.";
+    return (ignore_missing || num_success != 0) ? 0 : 1;
+  }
+}
+
 int main(int argc, char *argv[]) {
   try {
     using namespace kaldi;
@@ -36,14 +138,29 @@ int main(int argc, char *argv[]) {
     const char *usage =
         "Copy lattices (e.g. useful for changing to text mode or changing\n"
         "format to standard from compact lattice.)\n"
+        "The --include and --exclude mutually exclusive options of this "
+        "program, which are intended to copy only subset of lattices.\n"
         "Usage: lattice-copy [options] lattice-rspecifier lattice-wspecifier\n"
         " e.g.: lattice-copy --write-compact=false ark:1.lats ark,t:text.lats\n"
         "See also: lattice-to-fst, and the script egs/wsj/s5/utils/convert_slf.pl\n";
     
     ParseOptions po(usage);
-    bool write_compact = true;
+    bool write_compact = true, ignore_missing = false;
+    std::string include_rxfilename;
+    std::string exclude_rxfilename;
+
     po.Register("write-compact", &write_compact, "If true, write in normal (compact) form.");
-    
+    po.Register("include", &include_rxfilename, 
+                        "Text file, the first field of each "
+                        "line being interpreted as an "
+                        "utterance-id whose lattice will be included");
+    po.Register("exclude", &exclude_rxfilename, 
+                        "Text file, the first field of each "
+                        "line being interpreted as an utterance-id"
+                        " whose lattice will be excluded");
+    po.Register("ignore-missing", &ignore_missing,
+                        "Exit with status 0 even if no lattices are copied");
+
     po.Read(argc, argv);
 
     if (po.NumArgs() != 2) {
@@ -59,15 +176,46 @@ int main(int argc, char *argv[]) {
     if (write_compact) {
       SequentialCompactLatticeReader lattice_reader(lats_rspecifier);
       CompactLatticeWriter lattice_writer(lats_wspecifier);
+      
+      if (include_rxfilename != "") {
+        if (exclude_rxfilename != "") {
+          KALDI_ERR << "should not have both --exclude and --include option!";
+        }
+        return CopySubsetLattices(include_rxfilename,  
+            &lattice_reader, &lattice_writer,
+            true, ignore_missing);
+      } else if (exclude_rxfilename != "") {
+        return CopySubsetLattices(exclude_rxfilename, 
+            &lattice_reader, &lattice_writer,
+            false, ignore_missing);
+      }
+
       for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++)
         lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value());
     } else {
       SequentialLatticeReader lattice_reader(lats_rspecifier);
       LatticeWriter lattice_writer(lats_wspecifier);
+      
+      if (include_rxfilename != "") {
+        if (exclude_rxfilename != "") {
+          KALDI_ERR << "should not have both --exclude and --include option!";
+        }
+        return CopySubsetLattices(include_rxfilename,
+            &lattice_reader, &lattice_writer,
+            true, ignore_missing);  // include == true: keep listed utterances
+      } else if (exclude_rxfilename != "") {
+        return CopySubsetLattices(exclude_rxfilename,
+            &lattice_reader, &lattice_writer,
+            false, ignore_missing);  // include == false: drop listed utterances
+      }
+
       for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++)
         lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value());
     }
     KALDI_LOG << "Done copying " << n_done << " lattices.";
+    
+    if (ignore_missing) return 0;
+
     return (n_done != 0 ? 0 : 1);
   } catch(const std::exception &e) {
     std::cerr << e.what();
diff --git a/src/latbin/lattice-determinize-non-compact.cc b/src/latbin/lattice-determinize-non-compact.cc
new file mode 100644
index 00000000000..f3f0acde892
--- /dev/null
+++ b/src/latbin/lattice-determinize-non-compact.cc
@@ -0,0 +1,315 @@
+// latbin/lattice-determinize-non-compact.cc
+
+// Copyright 2009-2012  Microsoft Corporation
+//           2012-2013  Johns Hopkins University (Author: Daniel Povey)
+//           2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "util/stl-utils.h"
+#include "fstext/fstext-lib.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/lattice-functions.h"
+#include "lat/push-lattice.h"
+#include "lat/minimize-lattice.h"
+
+#ifdef _MSC_VER
+#include <unordered_map>
+using std::unordered_map;
+#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__)
+#include <unordered_map>
+using std::unordered_map;
+#else
+#include <tr1/unordered_map>
+using std::tr1::unordered_map;
+#endif
+
+
+namespace kaldi {
+
+typedef Lattice::StateId StateId;
+typedef Lattice::Arc Arc;
+
+// Determinizes "lat" into "*clat", retrying with a progressively smaller pruning
+// beam when determinization fails; returns true on success, false otherwise.
+bool DeterminizeLatticeWrapper(const Lattice &lat,
+                               const std::string &key,  // used in warnings only
+                               bool prune,  // if true, prune *clat on success
+                               BaseFloat beam,  // initial pruning beam
+                               BaseFloat beam_ratio,  // beam multiplier per retry
+                               int32 max_mem,  // copied into lat_opts.max_mem
+                               int32 max_loop,  // copied into lat_opts.max_loop
+                               BaseFloat delta,  // copied into lat_opts.delta
+                               int32 num_loops,  // upper bound on the retries
+                               CompactLattice *clat) {
+  fst::DeterminizeLatticeOptions lat_opts;
+  lat_opts.max_mem = max_mem;
+  lat_opts.max_loop = max_loop;
+  lat_opts.delta = delta;
+  BaseFloat cur_beam = beam;
+  for (int32 i = 0; i < num_loops;) { // we increment i below.
+    if (lat.Start() == fst::kNoStateId) {
+      KALDI_WARN << "Detected empty lattice, skipping " << key;
+      return false;
+    }
+    // First attempt: determinize the unpruned input lattice.
+    if (DeterminizeLattice(lat, clat, lat_opts, NULL)) { 
+      if (prune) PruneLattice(cur_beam, clat);
+      return true;
+    } else { // failed to determinize..
+      KALDI_WARN << "Failed to determinize lattice (presumably max-states "
+                 << "reached), reducing lattice-beam to "
+                 << (cur_beam*beam_ratio) << " and re-trying.";
+      for (; i < num_loops; i++) {  // runs i up to num_loops, ending the outer loop
+        cur_beam *= beam_ratio;
+        Lattice pruned_lat(lat);  // prune a copy; "lat" itself is const
+        PruneLattice(cur_beam, &pruned_lat);
+        if (NumArcs(lat) == NumArcs(pruned_lat)) {
+          cur_beam *= beam_ratio;  // extra shrink; the next pass shrinks again
+          KALDI_WARN << "Pruning did not have an effect on the original "
+                     << "lattice size; reducing beam to "
+                     << cur_beam << " and re-trying.";
+        } else if (DeterminizeLattice(pruned_lat, clat, lat_opts, NULL)) {
+          if (prune) PruneLattice(cur_beam, clat);
+          return true;
+        } else {
+          KALDI_WARN << "Determinization failed again; reducing beam again to "
+                     << (cur_beam*beam_ratio) << " and re-trying.";
+        }
+      }
+    }
+  }
+  KALDI_WARN << "Decreased pruning beam --num-loops=" << num_loops
+             << " times and was not able to determinize: failed for "
+             << key;
+  return false;
+}
+
+// Fills *acoustic_scores with, for each (state time, input label) pair found on
+// an arc of "lat", the summed acoustic cost (Value2) and the number of arcs.
+// Asserts that epsilon-input arcs and final weights have zero acoustic cost.
+void ComputeAcousticScoresMap(
+    const Lattice &lat, 
+    unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>, PairHasher<int32> > *acoustic_scores) {
+  acoustic_scores->clear();
+
+  std::vector<int32> state_times;
+  LatticeStateTimes(lat, &state_times);
+  KALDI_ASSERT(lat.Start() == 0);
+  for (StateId s = 0; s < lat.NumStates(); s++) {
+    int32 t = state_times[s];  // time of the arc's source state
+    for (fst::ArcIterator<Lattice> aiter(lat, s); !aiter.Done();
+          aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      const LatticeWeight &weight = arc.weight;
+      int32 tid = arc.ilabel;
+      if (tid != 0) {
+        unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>, PairHasher<int32> >::iterator it = acoustic_scores->find(std::make_pair(t, tid));
+        if (it == acoustic_scores->end()) {
+          acoustic_scores->insert(std::make_pair(std::make_pair(t, tid), std::make_pair(weight.Value2(), 1)));
+        } else {
+          // NOTE(review): check runs only when the count is exactly 2 -- intended?
+          if (it->second.second == 2 && it->second.first / it->second.second != weight.Value2()) {
+            KALDI_VLOG(2) << "Transitions on the same frame have different acoustic costs for tid "
+                          << tid << "; " 
+                          << it->second.first / it->second.second 
+                          << " vs " << weight.Value2();
+          }
+          it->second.first += weight.Value2();  // running sum of costs
+          it->second.second++;  // number of arcs accumulated
+        }
+      } else {
+        // Arcs with epsilon input label (tid) must have 0 acoustic cost
+        KALDI_ASSERT(weight.Value2() == 0);
+      }
+    }
+
+    LatticeWeight f = lat.Final(s);
+    if (f != LatticeWeight::Zero()) {
+      // Final acoustic cost must be 0
+      KALDI_ASSERT(f.Value2() == 0.0);
+    }
+  }
+}
+
+// Topologically sorts *lat, then overwrites each arc's acoustic cost (Value2)
+// with the mean (sum / count) stored in acoustic_scores for the arc's (state
+// time, input label) pair.  Epsilon-input arcs and final weights are set to 0.
+void ReplaceAcousticScoresFromMap(
+    const unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>, PairHasher<int32> > &acoustic_scores,
+    Lattice *lat) {
+  fst::TopSort(lat);
+  
+  std::vector<int32> state_times;
+  LatticeStateTimes(*lat, &state_times);
+  KALDI_ASSERT(lat->Start() == 0);
+  for (StateId s = 0; s < lat->NumStates(); s++) {
+    int32 t = state_times[s];  // time of the arc's source state
+    for (fst::MutableArcIterator<Lattice> aiter(lat, s); 
+          !aiter.Done(); aiter.Next()) {
+      Arc arc(aiter.Value());  // modify a copy, written back via SetValue below
+      int32 tid = arc.ilabel;
+      if (tid != 0) {
+        unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>, PairHasher<int32> >::const_iterator it = acoustic_scores.find(std::make_pair(t, tid));
+        if (it == acoustic_scores.end()) {
+          KALDI_ERR << "Could not find tid " << tid << " at time " << t
+                    << " in the acoustic scores map.";
+        } else {
+          arc.weight.SetValue2(it->second.first / it->second.second);
+        }
+      } else {
+        // For epsilon arcs, set acoustic cost to 0.0
+        arc.weight.SetValue2(0.0);
+      }
+      aiter.SetValue(arc);
+    }
+
+    LatticeWeight f = lat->Final(s);
+    if (f != LatticeWeight::Zero()) {
+      // Set final acoustic cost to 0.0
+      f.SetValue2(0.0);
+      lat->SetFinal(s, f);
+    }
+  }
+}
+
+}
+
+// Determinizes every input lattice, then rewrites arc acoustic scores from a map
+// built beforehand.  Returns 0 if any lattice was written, 1 if none, -1 on error.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    using fst::SymbolTable;
+    using fst::VectorFst;
+    using fst::StdArc;
+
+    const char *usage =
+        "lattice-determinize lattices (and apply a pruning beam)\n"
+        " (see http://kaldi.sourceforge.net/lattices.html for more explanation)\n"
+        "This version of the program retains the original "
+        "acoustic scores of arcs in the lattice. "
+        " note: this program is tyically only useful if you generated state-level\n"
+        " lattices, e.g. called gmm-latgen-simple with --determinize=false\n"
+        "\n"
+        "Usage: lattice-determinize-non-compact [options] lattice-rspecifier lattice-wspecifier\n"
+        " e.g.: lattice-determinize-non-compact --acoustic-scale=0.1 --beam=15.0 ark:1.lats ark:det.lats\n";
+      
+    ParseOptions po(usage);
+    BaseFloat acoustic_scale = 1.0;
+    BaseFloat beam = 10.0;
+    BaseFloat beam_ratio = 0.9;
+    int32 num_loops = 20;
+    int32 max_mem = 50000000; // 50 MB
+    int32 max_loop = 500000;
+    BaseFloat delta = fst::kDelta;
+    bool prune = false;
+    bool minimize = false;
+    
+    po.Register("acoustic-scale", &acoustic_scale,
+                "Scaling factor for acoustic likelihoods");
+    po.Register("beam", &beam,
+                "Pruning beam [applied after acoustic scaling]-- also used "
+                "to handle determinization failures, set --prune=false to "
+                "disable routine pruning");
+    po.Register("delta", &delta, "Tolerance used in determinization");
+    po.Register("prune", &prune, "If true, prune determinized lattices "
+                "with the --beam option.");
+    po.Register("max-mem", &max_mem, "Maximum approximate memory usage in "
+                "determinization (real usage might be many times this)");
+    po.Register("max-loop", &max_loop, "Option to detect a certain "
+                "type of failure in lattice determinization (not critical)");
+    po.Register("beam-ratio", &beam_ratio, "Ratio by which to "
+                "decrease beam if we reach the max-arcs.");
+    po.Register("num-loops", &num_loops, "Number of times to "
+                "decrease beam by beam-ratio if determinization fails.");
+    po.Register("minimize", &minimize,
+                "If true, push and minimize after determinization");
+    
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string lats_rspecifier = po.GetArg(1),
+        lats_wspecifier = po.GetArg(2);
+
+    // Read as regular lattice-- this is the form we need it in for efficient
+    // pruning.
+    SequentialLatticeReader lattice_reader(lats_rspecifier);
+    // Write as regular lattice.
+    LatticeWriter lattice_writer(lats_wspecifier); 
+    int32 n_done = 0, n_error = 0;
+
+    if (acoustic_scale == 0.0)
+      KALDI_ERR << "Do not use a zero acoustic scale (cannot be inverted)";
+
+    for (; !lattice_reader.Done(); lattice_reader.Next()) {
+      std::string key = lattice_reader.Key();
+      Lattice lat = lattice_reader.Value();
+      lattice_reader.FreeCurrent();
+
+      // Skip empty lattices here: ComputeAcousticScoresMap() asserts that the
+      // start state is 0, so an empty lattice would otherwise fail that assert.
+      if (lat.Start() == fst::kNoStateId) {
+        KALDI_WARN << "Detected empty lattice, skipping " << key;
+        n_error++;
+        continue;
+      }
+
+      fst::TopSort(&lat);
+      fst::ScaleLattice(fst::AcousticLatticeScale(acoustic_scale), &lat);
+
+      unordered_map<std::pair<int32,int32>, std::pair<BaseFloat, int32>, PairHasher<int32> > acoustic_scores;
+      ComputeAcousticScoresMap(lat, &acoustic_scores);
+
+      Invert(&lat); // make it so word labels are on the input.
+      CompactLattice clat;
+      if (DeterminizeLatticeWrapper(lat, key, prune,
+                                    beam, beam_ratio, max_mem, max_loop,
+                                    delta, num_loops, &clat)) {
+        if (minimize) {
+          PushCompactLatticeStrings(&clat);
+          PushCompactLatticeWeights(&clat);
+          MinimizeCompactLattice(&clat);
+        }
+        Lattice out_lat;
+        fst::ConvertLattice(clat, &out_lat);
+        fst::TopSort(&out_lat);
+        ReplaceAcousticScoresFromMap(acoustic_scores, &out_lat);
+        fst::ScaleLattice(fst::AcousticLatticeScale(1.0/acoustic_scale), &out_lat);
+        lattice_writer.Write(key, out_lat);
+        n_done++;
+      } else {
+        n_error++; // will have already printed warning.
+      }
+    }
+
+    KALDI_LOG << "Done " << n_done << " lattices, errors on " << n_error;
+    return (n_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
+
diff --git a/src/lm/Makefile b/src/lm/Makefile
index ddda9576557..acf327d994f 100644
--- a/src/lm/Makefile
+++ b/src/lm/Makefile
@@ -10,10 +10,10 @@ MATHLIB = NONE
 
 include ../kaldi.mk
 
-TESTFILES = lm-lib-test
+TESTFILES = arpa-file-parser-test lm-lib-test
 
-OBJFILES = const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o kaldi-rnnlm.o \
-           mikolov-rnnlm-lib.o
+OBJFILES = arpa-file-parser.o const-arpa-lm.o kaldi-lmtable.o kaldi-lm.o \
+	   kaldi-rnnlm.o mikolov-rnnlm-lib.o
 
 TESTOUTPUTS = composed.fst output.fst output1.fst output2.fst
 
diff --git a/src/lm/arpa-file-parser-test.cc b/src/lm/arpa-file-parser-test.cc
new file mode 100644
index 00000000000..e37a916d263
--- /dev/null
+++ b/src/lm/arpa-file-parser-test.cc
@@ -0,0 +1,365 @@
+// lm/arpa-file-parser-test.cc
+
+// Copyright 2016  Smart Action Company LLC (kkm)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * @file arpa-file-parser-test.cc
+ * @brief Unit tests for the ARPA file parser.
+ */
+
+#include <iomanip>
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <vector>
+#include "lm/kaldi-lm.h"
+
+#include "lm/arpa-file-parser.h"
+
+namespace kaldi {
+namespace {
+
+const int kMaxOrder = 3;
+
+struct NGramTestData {
+  int32 line_number;
+  float logprob;
+  int32 words[kMaxOrder];
+  float backoff;
+};
+
+std::ostream& operator<<(std::ostream& os, const NGramTestData& data) {
+  std::ios::fmtflags saved_state(os.flags());
+  os << std::fixed << std::setprecision(6);
+
+  os << data.logprob << ' ';
+  for (int i = 0; i < kMaxOrder; ++i) os << data.words[i] << ' ';
+  os << data.backoff << " // Line " << data.line_number;
+
+  os.flags(saved_state);
+  return os;
+}
+
+// This does not own the array pointer, and is used to simplify passing expected
+// results to TestableArpaFileParser::Validate.
+template <class T>
+struct CountedArray {
+  template <size_t N>
+  CountedArray(T(&array)[N]) : array(array), count(N) { }
+  const T* array;
+  const size_t count;
+};
+
+template <class T, size_t N>
+inline CountedArray<T> MakeCountedArray(T(&array)[N]) {
+  return CountedArray<T>(array);
+}
+
+// ArpaFileParser subclass that records parser callbacks so a test can check them.
+class TestableArpaFileParser : public ArpaFileParser {
+ public:
+  TestableArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols)
+      : ArpaFileParser(options, symbols),
+        header_available_(false),
+        read_complete_(false),
+        last_order_(0) { }
+  void Validate(CountedArray<int32> counts, CountedArray<NGramTestData> ngrams);
+ private:
+  // ArpaFileParser overrides.
+  virtual void HeaderAvailable();
+  virtual void ConsumeNGram(const NGram& ngram);
+  virtual void ReadComplete();
+
+  bool header_available_;  // set to true by HeaderAvailable()
+  bool read_complete_;  // set to true by ReadComplete()
+  int32 last_order_;  // word count of the last n-gram given to ConsumeNGram()
+  std::vector <NGramTestData> ngrams_;  // n-grams recorded by ConsumeNGram()
+};
+
+void TestableArpaFileParser::HeaderAvailable() {
+  KALDI_ASSERT(!header_available_);
+  KALDI_ASSERT(!read_complete_);
+  header_available_ = true;
+  KALDI_ASSERT(NgramCounts().size() <= kMaxOrder);
+}
+
+// Checks callback ordering and n-gram order, then records the n-gram in ngrams_.
+void TestableArpaFileParser::ConsumeNGram(const NGram& ngram) {
+  KALDI_ASSERT(header_available_);
+  KALDI_ASSERT(!read_complete_);
+  KALDI_ASSERT(ngram.words.size() <= NgramCounts().size());
+  KALDI_ASSERT(ngram.words.size() >= last_order_);  // orders never decrease
+  last_order_ = ngram.words.size();
+  NGramTestData entry = { 0 };  // zero-filled, so unused word slots stay 0
+  entry.line_number = LineNumber();
+  entry.logprob = ngram.logprob;
+  entry.backoff = ngram.backoff;
+  std::copy(ngram.words.begin(), ngram.words.end(), entry.words);
+  ngrams_.push_back(entry);
+}
+
+void TestableArpaFileParser::ReadComplete() {
+  KALDI_ASSERT(header_available_);
+  KALDI_ASSERT(!read_complete_);
+  read_complete_ = true;
+}
+
+// Compares two n-gram records; logs a warning and returns false on a mismatch.
+bool CompareNgrams(const NGramTestData& actual,
+                   const NGramTestData& expected) {
+  if (actual.line_number != expected.line_number
+      || !std::equal(actual.words, actual.words + kMaxOrder,
+                     expected.words)
+      || !ApproxEqual(actual.logprob, expected.logprob)
+      || !ApproxEqual(actual.backoff, expected.backoff)) {
+    KALDI_WARN << "Actual n-gram [" << actual
+               << "] differs from expected [" << expected << "]";
+    return false;
+  }
+  return true;
+}
+
+// Asserts that the parsed counts and recorded n-grams match the expected arrays.
+void TestableArpaFileParser::Validate(
+    CountedArray<int32> expect_counts,
+    CountedArray<NGramTestData> expect_ngrams) {
+  // This needs better diagnostics probably.
+  KALDI_ASSERT(NgramCounts().size() == expect_counts.count);
+  KALDI_ASSERT(std::equal(NgramCounts().begin(), NgramCounts().end(),
+                          expect_counts.array));
+  KALDI_ASSERT(ngrams_.size() == expect_ngrams.count);
+  // auto mpos = std::mismatch(ngrams_.begin(), ngrams_.end(),
+  //                           expect_ngrams.array, CompareNgrams);
+  // if (mpos.first != ngrams_.end())
+  //   KALDI_ERR << "Mismatch at index " << mpos.first - ngrams_.begin();
+  // TODO: auto above requires C++11, and I cannot spell out the type!!!
+  KALDI_ASSERT(std::equal(ngrams_.begin(), ngrams_.end(),
+                          expect_ngrams.array, CompareNgrams));
+}
+
+// Read integer LM (no symbols) with log base conversion.
+void ReadIntegerLmLogconvExpectSuccess() {
+  KALDI_LOG << "ReadIntegerLmLogconvExpectSuccess()";
+
+  static std::string integer_lm = "\
+\\data\\\n\
+ngram 1=4\n\
+ngram 2=2\n\
+ngram 3=2\n\
+\n\
+\\1-grams:\n\
+-5.234679	4 -3.3\n\
+-3.456783	5\n\
+0.0000000	1 -2.5\n\
+-4.333333	2\n\
+\n\
+\\2-grams:\n\
+-1.45678	4 5 -3.23\n\
+-1.30490	1 4 -4.2\n\
+\n\
+\\3-grams:\n\
+-0.34958	1 4 5\n\
+-0.23940	4 5 2\n\
+\n\
+\\end\\";
+
+  int32 expect_counts[] = { 4, 2, 2 };
+  NGramTestData expect_ngrams[] = {
+    {  7, -12.05329, { 4, 0, 0 }, -7.598531 },
+    {  8, -7.959537, { 5, 0, 0 },  0.0      },
+    {  9,  0.0,      { 1, 0, 0 }, -5.756463 },
+    { 10, -9.977868, { 2, 0, 0 },  0.0      },
+
+    { 13, -3.354360, { 4, 5, 0 }, -7.437350 },
+    { 14, -3.004643, { 1, 4, 0 }, -9.670857 },
+
+    { 17, -0.804938, { 1, 4, 5 },  0.0      },
+    { 18, -0.551239, { 4, 5, 2 },  0.0      } };
+
+  ArpaParseOptions options;
+  options.bos_symbol = 1;
+  options.eos_symbol = 2;
+
+  TestableArpaFileParser parser(options, NULL);
+  std::istringstream stm(integer_lm, std::ios_base::in);
+  parser.Read(stm, false);
+  parser.Validate(MakeCountedArray(expect_counts),
+                  MakeCountedArray(expect_ngrams));
+}
+
+// \xCE\xB2 = UTF-8 for Greek beta, to churn some UTF-8 cranks.
+static std::string symbolic_lm = "\
+\\data\\\n\
+ngram 1=4\n\
+ngram 2=2\n\
+ngram 3=2\n\
+\n\
+\\1-grams:\n\
+-5.2	a -3.3\n\
+-3.4	\xCE\xB2\n\
+0.0	<s> -2.5\n\
+-4.3	</s>\n\
+\n\
+\\2-grams:\n\
+-1.5	a \xCE\xB2 -3.2\n\
+-1.3	<s> a -4.2\n\
+\n\
+\\3-grams:\n\
+-0.3	<s> a \xCE\xB2\n\
+-0.2	<s> a </s>\n\
+\n\
+\\end\\";
+
+// Symbol table that is created with predefined test symbols, "a" but no "b".
+class TestSymbolTable : public fst::SymbolTable {
+ public:
+  TestSymbolTable() {
+    AddSymbol("<eps>", 0);
+    AddSymbol("<s>", 1);
+    AddSymbol("</s>", 2);
+    AddSymbol("<unk>", 3);
+    AddSymbol("a", 4);
+  }
+};
+
+// Full expected result shared between ReadSymbolicLmNoOovImpl and
+// ReadSymbolicLmWithOovAddToSymbols().
+NGramTestData expect_symbolic_full[] = {
+  {  7, -5.2, { 4, 0, 0 }, -3.3 },
+  {  8, -3.4, { 5, 0, 0 },  0.0 },
+  {  9,  0.0, { 1, 0, 0 }, -2.5 },
+  { 10, -4.3, { 2, 0, 0 },  0.0 },
+
+  { 13, -1.5, { 4, 5, 0 }, -3.2 },
+  { 14, -1.3, { 1, 4, 0 }, -4.2 },
+
+  { 17, -0.3, { 1, 4, 5 },  0.0 },
+  { 18, -0.2, { 1, 4, 2 },  0.0 } };
+
+// This is run with every possible OOV setting and yields the same result.
+void ReadSymbolicLmNoOovImpl(ArpaParseOptions::OovHandling oov) {
+  int32 expect_counts[] = { 4, 2, 2 };
+  TestSymbolTable symbols;
+  symbols.AddSymbol("\xCE\xB2", 5);
+
+  ArpaParseOptions options;
+  options.bos_symbol = 1;
+  options.eos_symbol = 2;
+  options.unk_symbol = 3;
+  options.use_log10 = true;
+  options.oov_handling = oov;
+  TestableArpaFileParser parser(options, &symbols);
+  std::istringstream stm(symbolic_lm, std::ios_base::in);
+  parser.Read(stm, false);
+  parser.Validate(MakeCountedArray(expect_counts),
+                  MakeCountedArray(expect_symbolic_full));
+  KALDI_ASSERT(symbols.NumSymbols() == 6);
+}
+
+void ReadSymbolicLmNoOovTests() {
+  KALDI_LOG << "ReadSymbolicLmNoOovImpl(kRaiseError)";
+  ReadSymbolicLmNoOovImpl(ArpaParseOptions::kRaiseError);
+  KALDI_LOG << "ReadSymbolicLmNoOovImpl(kAddToSymbols)";
+  ReadSymbolicLmNoOovImpl(ArpaParseOptions::kAddToSymbols);
+  KALDI_LOG << "ReadSymbolicLmNoOovImpl(kReplaceWithUnk)";
+  ReadSymbolicLmNoOovImpl(ArpaParseOptions::kReplaceWithUnk);
+  KALDI_LOG << "ReadSymbolicLmNoOovImpl(kSkipNGram)";
+  ReadSymbolicLmNoOovImpl(ArpaParseOptions::kSkipNGram);
+}
+
+// Shared driver for the with-OOV tests; the expected n-grams vary by OOV mode.
+void ReadSymbolicLmWithOovImpl(
+    ArpaParseOptions::OovHandling oov,
+    CountedArray<NGramTestData> expect_ngrams,
+    fst::SymbolTable* symbols) {
+  int32 expect_counts[] = { 4, 2, 2 };
+  ArpaParseOptions options;
+  options.bos_symbol = 1;
+  options.eos_symbol = 2;
+  options.unk_symbol = 3;
+  options.use_log10 = true;
+  options.oov_handling = oov;
+  TestableArpaFileParser parser(options, symbols);
+  std::istringstream stm(symbolic_lm, std::ios_base::in);
+  parser.Read(stm, false);
+  parser.Validate(MakeCountedArray(expect_counts), expect_ngrams);
+}
+
+void ReadSymbolicLmWithOovAddToSymbols() {
+  TestSymbolTable symbols;
+  ReadSymbolicLmWithOovImpl(ArpaParseOptions::kAddToSymbols,
+                            MakeCountedArray(expect_symbolic_full),
+                            &symbols);
+  KALDI_ASSERT(symbols.NumSymbols() == 6);
+  KALDI_ASSERT(symbols.Find("\xCE\xB2") == 5);
+}
+
+void ReadSymbolicLmWithOovReplaceWithUnk() {
+  NGramTestData expect_symbolic_unk_b[] = {
+    {  7, -5.2, { 4, 0, 0 }, -3.3 },
+    {  8, -3.4, { 3, 0, 0 },  0.0 },
+    {  9,  0.0, { 1, 0, 0 }, -2.5 },
+    { 10, -4.3, { 2, 0, 0 },  0.0 },
+
+    { 13, -1.5, { 4, 3, 0 }, -3.2 },
+    { 14, -1.3, { 1, 4, 0 }, -4.2 },
+
+    { 17, -0.3, { 1, 4, 3 },  0.0 },
+    { 18, -0.2, { 1, 4, 2 },  0.0 } };
+
+  TestSymbolTable symbols;
+  ReadSymbolicLmWithOovImpl(ArpaParseOptions::kReplaceWithUnk,
+                            MakeCountedArray(expect_symbolic_unk_b),
+                            &symbols);
+  KALDI_ASSERT(symbols.NumSymbols() == 5);
+}
+
+void ReadSymbolicLmWithOovSkipNGram() {
+  NGramTestData expect_symbolic_no_b[] = {
+    {  7, -5.2, { 4, 0, 0 }, -3.3 },
+    {  9,  0.0, { 1, 0, 0 }, -2.5 },
+    { 10, -4.3, { 2, 0, 0 },  0.0 },
+
+    { 14, -1.3, { 1, 4, 0 }, -4.2 },
+
+    { 18, -0.2, { 1, 4, 2 },  0.0 } };
+
+  TestSymbolTable symbols;
+  ReadSymbolicLmWithOovImpl(ArpaParseOptions::kSkipNGram,
+                            MakeCountedArray(expect_symbolic_no_b),
+                            &symbols);
+  KALDI_ASSERT(symbols.NumSymbols() == 5);
+}
+
+void ReadSymbolicLmWithOovTests() {
+  KALDI_LOG << "ReadSymbolicLmWithOovAddToSymbols()";
+  ReadSymbolicLmWithOovAddToSymbols();
+  KALDI_LOG << "ReadSymbolicLmWithOovReplaceWithUnk()";
+  ReadSymbolicLmWithOovReplaceWithUnk();
+  KALDI_LOG << "ReadSymbolicLmWithOovSkipNGram()";
+  ReadSymbolicLmWithOovSkipNGram();
+}
+
+}  // namespace
+}  // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  kaldi::ReadIntegerLmLogconvExpectSuccess();
+  kaldi::ReadSymbolicLmNoOovTests();
+  kaldi::ReadSymbolicLmWithOovTests();
+}
diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc
new file mode 100644
index 00000000000..2d8f9f18638
--- /dev/null
+++ b/src/lm/arpa-file-parser.cc
@@ -0,0 +1,236 @@
+// lm/arpa-file-parser.cc
+
+// Copyright 2014  Guoguo Chen
+// Copyright 2016  Smart Action Company LLC (kkm)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include <fst/fstlib.h>
+
+#include "base/kaldi-error.h"
+#include "base/kaldi-math.h"
+#include "lm/arpa-file-parser.h"
+#include "util/text-utils.h"
+
+namespace kaldi {
+
+ArpaFileParser::ArpaFileParser(ArpaParseOptions options,
+                               fst::SymbolTable* symbols)
+    : options_(options), symbols_(symbols), line_number_(0) { }
+
+ArpaFileParser::~ArpaFileParser() {  // symbols_ is not owned; nothing to free.
+}
+
+void ArpaFileParser::Read(std::istream &is, bool binary) {
+  // Parses a text ARPA LM from 'is', firing the virtual callbacks in order.
+  if (binary)
+    KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser";
+
+  // Argument sanity checks.
+  if (options_.bos_symbol <= 0 || options_.eos_symbol <= 0 ||
+      options_.bos_symbol == options_.eos_symbol)
+    KALDI_ERR << "BOS and EOS symbols are required, must not be epsilons, and "
+              << "differ from each other. Given:"
+              << " BOS=" << options_.bos_symbol
+              << " EOS=" << options_.eos_symbol;
+  if (symbols_ != NULL &&
+      options_.oov_handling == ArpaParseOptions::kReplaceWithUnk &&
+      (options_.unk_symbol <= 0 ||
+       options_.unk_symbol == options_.bos_symbol ||
+       options_.unk_symbol == options_.eos_symbol))
+    KALDI_ERR << "When symbol table is given and OOV mode is kReplaceWithUnk, "
+              << "UNK symbol is required, must not be epsilon, and "
+              << "differ from both BOS and EOS symbols. Given:"
+              << " UNK=" << options_.unk_symbol
+              << " BOS=" << options_.bos_symbol
+              << " EOS=" << options_.eos_symbol;
+  if (symbols_ != NULL && symbols_->Find(options_.bos_symbol).empty())
+    KALDI_ERR << "BOS symbol must exist in symbol table";
+  if (symbols_ != NULL && symbols_->Find(options_.eos_symbol).empty())
+    KALDI_ERR << "EOS symbol must exist in symbol table";
+  if (symbols_ != NULL && options_.unk_symbol > 0 &&
+      symbols_->Find(options_.unk_symbol).empty())
+    KALDI_ERR << "UNK symbol must exist in symbol table";
+
+  ngram_counts_.clear();
+  line_number_ = 0;
+
+#define PARSE_ERR (KALDI_ERR << "in line " << line_number_ << ": ")
+
+  // Give derived class an opportunity to prepare its state.
+  ReadStarted();
+
+  std::string line;
+
+  // Processes "\data\" section.
+  bool keyword_found = false;
+  while (++line_number_, getline(is, line) && !is.eof()) {
+    if (line.empty()) continue;
+
+    // The section keywords starts with backslash. We terminate the while loop
+    // if a new section is found.
+    if (line[0] == '\\') {
+      if (!keyword_found && line == "\\data\\") {
+        KALDI_LOG << "Reading \\data\\ section.";
+        keyword_found = true;
+        continue;
+      }
+      break;
+    }
+
+    if (!keyword_found) continue;
+
+    // Enters "\data\" section, and looks for patterns like "ngram 1=1000",
+    // which means there are 1000 unigrams.
+    std::size_t equal_symbol_pos = line.find("=");
+    if (equal_symbol_pos != std::string::npos)
+      line.replace(equal_symbol_pos, 1, " = ");  // Inserts spaces around "="
+    std::vector<std::string> col;
+    SplitStringToVector(line, " \t", true, &col);
+    if (col.size() == 4 && col[0] == "ngram" && col[2] == "=") {
+      int32 order, ngram_count = 0;
+      if (!ConvertStringToInteger(col[1], &order) ||
+          !ConvertStringToInteger(col[3], &ngram_count)) {
+        PARSE_ERR << "Cannot parse ngram count '" << line << "'.";
+      }
+      if (order <= 0 || ngram_count < 0)  // order indexes ngram_counts_ below.
+        PARSE_ERR << "Invalid ngram order or count '" << line << "'.";
+      if (ngram_counts_.size() < order) ngram_counts_.resize(order);
+      ngram_counts_[order - 1] = ngram_count;
+    } else {
+      KALDI_WARN << "Uninterpretable line in \\data\\ section: " << line;
+    }
+  }
+
+  if (ngram_counts_.size() == 0)
+    PARSE_ERR << "\\data\\ section missing or empty.";
+
+  // Signal that grammar order and n-gram counts are known.
+  HeaderAvailable();
+
+  NGram ngram;
+  ngram.words.reserve(ngram_counts_.size());
+
+  // Processes "\N-grams:" section.
+  for (int32 cur_order = 1; cur_order <= ngram_counts_.size(); ++cur_order) {
+    // Skips n-grams with zero count.
+    if (ngram_counts_[cur_order - 1] == 0) {
+      KALDI_WARN << "Zero ngram count in ngram order " << cur_order
+                 << "(look for 'ngram " << cur_order << "=0' in the \\data\\ "
+                 << " section). There is possibly a problem with the file.";
+      continue;
+    }
+
+    // Must be looking at a \k-grams: directive at this point.
+    std::ostringstream keyword;
+    keyword << "\\" << cur_order << "-grams:";
+    if (line != keyword.str()) {
+      PARSE_ERR << "Invalid directive '" << line << "', "
+                << "expecting '" << keyword.str() << "'.";
+    }
+    KALDI_LOG << "Reading " << line << " section.";
+
+    int32 ngram_count = 0;
+    while (++line_number_, getline(is, line) && !is.eof()) {
+      if (line.empty()) continue;
+      if (line[0] == '\\') break;
+
+      std::vector<std::string> col;
+      SplitStringToVector(line, " \t", true, &col);
+
+      if (col.size() < 1 + cur_order ||
+          col.size() > 2 + cur_order ||
+          (cur_order == ngram_counts_.size() && col.size() != 1 + cur_order)) {
+        PARSE_ERR << "Invalid n-gram line '"  << line << "'";
+      }
+      ++ngram_count;
+
+      // Parse out n-gram logprob and, if present, backoff weight.
+      if (!ConvertStringToReal(col[0], &ngram.logprob)) {
+        PARSE_ERR << "Invalid n-gram logprob '" << col[0] << "'.";
+      }
+      ngram.backoff = 0.0;
+      if (col.size() > cur_order + 1) {
+        if (!ConvertStringToReal(col[cur_order + 1], &ngram.backoff))
+          PARSE_ERR << "Invalid backoff weight '" << col[cur_order + 1] << "'.";
+      }
+      // Convert to natural log unless the option is set not to.
+      if (!options_.use_log10) {
+        ngram.logprob *= M_LN10;
+        ngram.backoff *= M_LN10;
+      }
+
+      ngram.words.resize(cur_order);
+      bool skip_ngram = false;
+      for (int32 index = 0; !skip_ngram && index < cur_order; ++index) {
+        int32 word;
+        if (symbols_) {
+          // Symbol table provided, so symbol labels are expected.
+          if (options_.oov_handling == ArpaParseOptions::kAddToSymbols) {
+            word = symbols_->AddSymbol(col[1 + index]);
+          } else {
+            word = symbols_->Find(col[1 + index]);
+            if (word == fst::SymbolTable::kNoSymbol) {
+              switch(options_.oov_handling) {
+                case ArpaParseOptions::kReplaceWithUnk:
+                  word = options_.unk_symbol;
+                  break;
+                case ArpaParseOptions::kSkipNGram:
+                  skip_ngram = true;
+                  break;
+                default:
+                  PARSE_ERR << "Word '"  << col[1 + index]
+                            << "' not in symbol table.";
+              }
+            }
+          }
+        } else {
+          // Symbols not provided, LM file should contain integers.
+          if (!ConvertStringToInteger(col[1 + index], &word) || word < 0) {
+            PARSE_ERR << "invalid symbol '" << col[1 + index] << "'";
+          }
+        }
+        // Whichever way we got it, an epsilon is invalid.
+        if (word == 0) {
+          PARSE_ERR << "Epsilon symbol '" << col[1 + index]
+                    << "' is illegal in ARPA LM.";
+        }
+        ngram.words[index] = word;
+      }
+      if (!skip_ngram) {
+        ConsumeNGram(ngram);
+      }
+    }
+    if (ngram_count > ngram_counts_[cur_order - 1]) {  // Fewer is tolerated.
+      PARSE_ERR << "Header said there would be " << ngram_counts_[cur_order - 1]
+                << " n-grams of order " << cur_order << ", but we saw "
+                << ngram_count;
+    }
+  }
+
+  if (line != "\\end\\") {
+    PARSE_ERR << "Invalid or unexpected directive line '" << line << "', "
+              << "expected \\end\\.";
+  }
+
+  ReadComplete();
+
+#undef PARSE_ERR
+}
+
+}  // namespace kaldi
diff --git a/src/lm/arpa-file-parser.h b/src/lm/arpa-file-parser.h
new file mode 100644
index 00000000000..0011fb4ee21
--- /dev/null
+++ b/src/lm/arpa-file-parser.h
@@ -0,0 +1,125 @@
+// lm/arpa-file-parser.h
+
+// Copyright 2014  Guoguo Chen
+// Copyright 2016  Smart Action Company LLC (kkm)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_LM_ARPA_FILE_PARSER_H_
+#define KALDI_LM_ARPA_FILE_PARSER_H_
+
+#include <string>
+#include <vector>
+
+#include <fst/fst-decl.h>
+
+#include "base/kaldi-types.h"
+
+namespace kaldi {
+
+/**
+  Options that control ArpaFileParser; they are validated by Read().
+*/
+struct ArpaParseOptions {
+  enum OovHandling {
+    kRaiseError,     ///< Raise an error on OOV words (the default).
+    kAddToSymbols,   ///< Add novel words to the symbol table.
+    kReplaceWithUnk, ///< Replace OOV words with unk_symbol.
+    kSkipNGram       ///< Skip n-gram with OOV word and continue.
+  };
+
+  ArpaParseOptions()
+      : bos_symbol(-1), eos_symbol(-1), unk_symbol(-1),
+        oov_handling(kRaiseError), use_log10(false) { }
+
+  int32 bos_symbol;  ///< Symbol for <s>. Required, must be > 0.
+  int32 eos_symbol;  ///< Symbol for </s>. Required, > 0, != bos_symbol.
+  int32 unk_symbol;  ///< Symbol for <unk>, Required for kReplaceWithUnk.
+  OovHandling oov_handling;  ///< How to handle OOV words in the file.
+  bool use_log10;    ///< Keep log10 values as read; if false, convert to ln.
+};
+
+/**
+   A parsed n-gram from ARPA LM file, as passed to ConsumeNGram().
+*/
+struct NGram {
+  NGram() : logprob(0.0), backoff(0.0) { }
+  std::vector<int32> words;  ///< Symbols in LTR order.
+  float logprob;             ///< Log-prob; ln unless use_log10 is set.
+  float backoff;             ///< Log-backoff weight (same base); 0 if absent.
+};
+
+/**
+    ArpaFileParser is an abstract base class for ARPA LM file conversion.
+
+    See ConstArpaLmBuilder for a usage example.
+*/
+class ArpaFileParser {
+ public:
+  /// Constructs the parser with the given options and optional symbol table.
+  /// If symbol table is provided, then the file should contain text n-grams,
+  /// and the words are mapped to symbols through it. bos_symbol and
+  /// eos_symbol in the options structure must be valid symbols in the table,
+  /// and so must be unk_symbol if provided. The table is not owned by the
+  /// parser, but may be augmented, if oov_handling is set to kAddToSymbols.
+  /// If symbol table is a null pointer, the file should contain integer
+  /// symbol values, and oov_handling has no effect. bos_symbol and eos_symbol
+  /// must be valid symbols still.
+  ArpaFileParser(ArpaParseOptions options, fst::SymbolTable* symbols);
+  virtual ~ArpaFileParser();
+
+  /// Read ARPA LM file through Kaldi I/O functions. Only text mode is
+  /// supported; passing binary == true is reported as an error.
+  void Read(std::istream &is, bool binary);
+
+  const ArpaParseOptions& Options() const { return options_; }
+
+ protected:
+  /// Override called before reading starts. This is the point to prepare
+  /// any state in the derived class.
+  virtual void ReadStarted() { }
+
+  /// Override function called to signal that ARPA header with the expected
+  /// number of n-grams has been read, and NgramCounts() is now valid.
+  virtual void HeaderAvailable() { }
+
+  /// Pure virtual that must be implemented to process current n-gram. The
+  /// n-grams are sent in the file order, which guarantees that all
+  /// (k-1)-grams are processed before the first k-gram is.
+  virtual void ConsumeNGram(const NGram&) = 0;
+
+  /// Override function called after the last n-gram has been consumed.
+  virtual void ReadComplete() { }
+
+  /// Read-only access to symbol table.
+  const fst::SymbolTable* Symbols() const { return symbols_; }
+
+  /// Inside ConsumeNGram(), provides the current line number.
+  int32 LineNumber() const { return line_number_; }
+
+  /// N-gram counts. Valid in and after a call to HeaderAvailable().
+  const std::vector<int32>& NgramCounts() const { return ngram_counts_; }
+
+ private:
+  ArpaParseOptions options_;
+  fst::SymbolTable* symbols_;  // Not owned.
+  int32 line_number_;
+  std::vector<int32> ngram_counts_;
+};
+
+}  // namespace kaldi
+
+#endif  // KALDI_LM_ARPA_FILE_PARSER_H_
diff --git a/src/lm/const-arpa-lm.cc b/src/lm/const-arpa-lm.cc
index 7f63dce886e..5043933d7f0 100644
--- a/src/lm/const-arpa-lm.cc
+++ b/src/lm/const-arpa-lm.cc
@@ -22,13 +22,14 @@
 #include <sstream>
 #include <utility>
 
+#include "base/kaldi-math.h"
+#include "lm/arpa-file-parser.h"
 #include "lm/const-arpa-lm.h"
 #include "util/stl-utils.h"
 #include "util/text-utils.h"
-#include "base/kaldi-math.h"
 
-namespace kaldi {
 
+namespace kaldi {
 
 // Auxiliary struct for converting ConstArpaLm format langugae model to Arpa
 // format.
@@ -173,13 +174,10 @@ class LmState {
 
 // Class to build ConstArpaLm from Arpa format language model. It relies on the
 // auxiliary class LmState above.
-class ConstArpaLmBuilder {
+class ConstArpaLmBuilder : public ArpaFileParser {
  public:
-  ConstArpaLmBuilder(
-      const bool natural_base, const int32 bos_symbol,
-      const int32 eos_symbol, const int32 unk_symbol) :
-      natural_base_(natural_base), bos_symbol_(bos_symbol),
-      eos_symbol_(eos_symbol), unk_symbol_(unk_symbol) {
+  ConstArpaLmBuilder(ArpaParseOptions options)
+      : ArpaFileParser(options, NULL) {
     ngram_order_ = 0;
     num_words_ = 0;
     overflow_buffer_size_ = 0;
@@ -204,21 +202,21 @@ class ConstArpaLmBuilder {
     }
   }
 
-  // Reads in the Arpa format language model, parses it and creates LmStates.
-  void Read(std::istream &is, bool binary);
-
   // Writes ConstArpaLm.
   void Write(std::ostream &os, bool binary) const;
 
-  // Builds ConstArpaLm.
-  void Build();
-
   void SetMaxAddressOffset(const int32 max_address_offset) {
     KALDI_WARN << "You are changing <max_address_offset_>; the default should "
         << "not be changed unless you are in testing mode.";
     max_address_offset_ = max_address_offset;
   }
 
+ protected:
+  // ArpaFileParser overrides.
+  virtual void HeaderAvailable();
+  virtual void ConsumeNGram(const NGram& ngram);
+  virtual void ReadComplete();
+
  private:
   struct WordsAndLmStatePairLessThan {
     bool operator()(
@@ -229,10 +227,6 @@ class ConstArpaLmBuilder {
   };
 
  private:
-  // If true, use natural base e for log-prob, otherwise use base 10. The
-  // default base in Arpa format language model is base 10.
-  bool natural_base_;
-
   // Indicating if ConstArpaLm has been built or not.
   bool is_built_;
 
@@ -240,16 +234,6 @@ class ConstArpaLmBuilder {
   // The default value is 30-bits and should not be changed except for testing.
   int32 max_address_offset_;
 
-  // Integer corresponds to <s>.
-  int32 bos_symbol_;
-
-  // Integer corresponds to </s>.
-  int32 eos_symbol_;
-
-  // Integer corresponds to unknown-word. -1 if no unknown-word symbol is
-  // provided.
-  int32 unk_symbol_;
-
   // N-gram order of language model. This can be figured out from "/data/"
   // section in Arpa format language model.
   int32 ngram_order_;
@@ -280,201 +264,58 @@ class ConstArpaLmBuilder {
                 LmState*, VectorHasher<int32> > seq_to_state_;
 };
 
-// Reads in the Arpa format language model, parses it and puts the word sequence
-// into the corresponding LmState in <seq_to_state_>.
-void ConstArpaLmBuilder::Read(std::istream &is, bool binary) {
-  if (binary) {
-    KALDI_ERR << "binary-mode reading is not implemented for "
-        << "ConstArpaLmBuilder.";
-  }
-
-  std::string line;
-
-  // Number of n-grams from "\data\" section. Those numbers should match the
-  // actual number of n-grams from "\N-grams:" sections.
-  // Note that when we convert the words in the Arpa format language model into
-  // integers, we remove lines with OOV words. We also modify the n-gram counts
-  // in "\data\" correspondingly.
-  std::vector<int32> num_ngrams;
-
-  // Processes "\data\" section.
-  bool keyword_found = false;
-  while (getline(is, line) && !is.eof()) {
-    // The section keywords starts with backslash. We terminate the while loop
-    // if a new section is found.
-    if (!line.empty() && line[0] == '\\') {
-      if (line.find("-grams:") != std::string::npos) break;
-      if (line.find("\\end\\") != std::string::npos) break;
-    }
-
-    std::size_t equal_symbol_pos = line.find("=");
-    if (equal_symbol_pos != std::string::npos)
-      line.replace(equal_symbol_pos, 1, " = ");  // Inserts spaces around "="
-    std::vector<std::string> col;
-    SplitStringToVector(line, " \t", true, &col);
-
-    // Looks for keyword "\data\".
-    if (!keyword_found && col.size() == 1 && col[0] == "\\data\\") {
-      KALDI_LOG << "Reading \"\\data\\\" section.";
-      keyword_found = true;
-      continue;
-    }
+void ConstArpaLmBuilder::HeaderAvailable() {
+  ngram_order_ = NgramCounts().size();  // Number of n-gram orders in header.
+}
 
-    // Enters "\data\" section, and looks for patterns like"ngram 1=1000", which
-    // means there are 1000 unigrams.
-    if (keyword_found && col.size() == 4 && col[0] == "ngram") {
-      if (col[2] == "=") {
-        int32 order, ngram_count;
-        if (!ConvertStringToInteger(col[1], &order)) {
-          KALDI_ERR << "bad line: " << line << "; fail to convert "
-              << col[1] << " to integer.";
-        }
-        if (!ConvertStringToInteger(col[3], &ngram_count)) {
-          KALDI_ERR << "bad line: " << line << "; fail to convert "
-              << col[3] << " to integer.";
-        }
-        if (num_ngrams.size() <= order) {
-          num_ngrams.resize(order + 1);
-        }
-        num_ngrams[order] = ngram_count;
-      } else {
-        KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line;
-      }
-    } else if (keyword_found) {
-      KALDI_WARN << "Uninterpretable line \"\\data\\\" section: " << line;
-    }
+void ConstArpaLmBuilder::ConsumeNGram(const NGram& ngram) {
+  int32 cur_order = ngram.words.size();
+  // If <ngram_order_> is larger than 1, then we do not create LmState for
+  // the final order entry. We only keep the log probability for it.
+  LmState *lm_state = NULL;
+  if (cur_order != ngram_order_ || ngram_order_ == 1) {
+    lm_state = new LmState(cur_order == 1,
+                           cur_order == ngram_order_ - 1,
+                           ngram.logprob, ngram.backoff);
+
+    KALDI_ASSERT(seq_to_state_.find(ngram.words) == seq_to_state_.end());
+    seq_to_state_[ngram.words] = lm_state;
   }
-  if (num_ngrams.size() == 0)
-    KALDI_ERR << "Fail to read \"\\data\\\" section.";
-  ngram_order_ = num_ngrams.size() - 1;
-
-  // Processes "\N-grams:" section.
-  int32 max_word_id = 0;
-  for (int32 cur_order = 1; cur_order < num_ngrams.size(); ++cur_order) {
-    // Skips n-grams with zero count.
-    if (num_ngrams[cur_order] == 0) continue;
-
-    keyword_found = false;
-    int32 ngram_count = 0;
-    std::ostringstream keyword;
-    keyword << "\\" << cur_order << "-grams:";
-    // We use "do ... while" loop since one line has already been read.
-    do {
-      // The section keywords starts with backslash. We terminate the while loop
-      // if a new section is found.
-      if (!line.empty() && line[0] == '\\') {
-        if (line.find("-grams:") != std::string::npos && keyword_found) break;
-        if (line.find("\\end\\") != std::string::npos) break;
-      }
 
-      std::vector<std::string> col;
-      SplitStringToVector(line, " \t", true, &col);
-
-      // Looks for keyword "\N-gram:" if the keyword has not been located.
-      if (!keyword_found && col.size() == 1 && col[0] == keyword.str()) {
-        KALDI_LOG << "Reading \"" << keyword.str() << "\" section.";
-        ngram_count = 0;
-        keyword_found = true;
-        continue;
-      }
-
-      // Enters "\N-grams:" section if the keyword has been located.
-      if (keyword_found && col.size() > 0) {
-        KALDI_ASSERT(col.size() >= 1 + cur_order);
-        KALDI_ASSERT(col.size() <= 2 + cur_order);  // backoff_logprob can be 0.
-        if (cur_order == ngram_order_ && col.size() == 2 + cur_order) {
-          KALDI_ERR << "Backoff probability detected for final-order entry \""
-              << line << "\".";
-        }
-        ngram_count++;
-
-        // If backoff_logprob is 0, it will not appear in Arpa format language
-        // model. We put it back so the processing afterwards will be easier.
-        if (col.size() == 1 + cur_order) {
-          col.push_back("0");
-        }
-
-        // Creates LmState for the current word sequence.
-        bool is_unigram = (cur_order == 1) ? true : false;
-        float logprob;
-        float backoff_logprob;
-        KALDI_ASSERT(ConvertStringToReal(col[0], &logprob));
-        KALDI_ASSERT(ConvertStringToReal(col[1 + cur_order], &backoff_logprob));
-        if (natural_base_) {
-          logprob *= Log(10.0f);
-          backoff_logprob *= Log(10.0f);
-        }
-
-        // If <ngram_order_> is larger than 1, then we do not create LmState for
-        // the final order entry. We only keep the log probability for it.
-        LmState *lm_state = NULL;
-        if (cur_order != ngram_order_ || ngram_order_ == 1) {
-          lm_state = new LmState(is_unigram,
-                                 (cur_order == ngram_order_ - 1),
-                                 logprob, backoff_logprob);
-        }
-
-        // Figures out the sequence of words.
-        std::vector<int32> seq(cur_order, 0);
-        for (int32 index = 0; index < cur_order; ++index) {
-          int32 word;
-          if (!ConvertStringToInteger(col[1 + index], &word)) {
-            KALDI_ERR << "bad line: " << line << "; fail to convert "
-                << col[1 + index] << " to integer.";
-          }
-          seq[index] = word;
-        }
-
-        // If <ngram_order_> is larger than 1, then we do not insert LmState to
-        // <seq_to_state_>.
-        if (cur_order != ngram_order_ || ngram_order_ == 1) {
-          KALDI_ASSERT(lm_state != NULL);
-          KALDI_ASSERT(seq_to_state_.find(seq) == seq_to_state_.end());
-          seq_to_state_[seq] = lm_state;
-        }
-
-        // If n-gram order is larger than 1, we have to add possible child to
-        // existing LmStates. We have the following two assumptions:
-        // 1. N-grams are processed from small order to larger ones, i.e., from
-        //    1, 2, ... to the highest order.
-        // 2. If a n-gram exists in the Arpa format language model, then the
-        //    "history" n-gram also exists. For example, if "A B C" is a valid
-        //    n-gram, then "A B" is also a valid n-gram.
-        if (cur_order > 1) {
-          std::vector<int32> hist(seq.begin(), seq.begin() + cur_order - 1);
-          int32 word = seq[seq.size() - 1];
-          unordered_map<std::vector<int32>,
-                        LmState*, VectorHasher<int32> >::iterator hist_iter;
-          hist_iter = seq_to_state_.find(hist);
-          KALDI_ASSERT(hist_iter != seq_to_state_.end());
-          if (cur_order != ngram_order_ || ngram_order_ == 1) {
-            KALDI_ASSERT(lm_state != NULL);
-            KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder());
-            hist_iter->second->AddChild(word, lm_state);
-          } else {
-            KALDI_ASSERT(lm_state == NULL);
-            KALDI_ASSERT(hist_iter->second->IsChildFinalOrder());
-            hist_iter->second->AddChild(word, logprob);
-          }
-        } else {
-          // Figures out <max_word_id>.
-          KALDI_ASSERT(seq.size() == 1);
-          if (seq[0] > max_word_id) {
-            max_word_id = seq[0];
-          }
-        }
-      }
-    } while (getline(is, line) && !is.eof());
-    if (ngram_count > num_ngrams[cur_order] ||
-        (ngram_count == 0 && num_ngrams[cur_order] != 0)) {
-      KALDI_ERR << "Header said there would be " << num_ngrams[cur_order]
-                << " n-grams of order " << cur_order << ", but we saw "
-                << ngram_count;
+  // If n-gram order is larger than 1, we have to add possible child to
+  // existing LmStates. We have the following two assumptions:
+  // 1. N-grams are processed from small order to larger ones, i.e., from
+  //    1, 2, ... to the highest order.
+  // 2. If a n-gram exists in the Arpa format language model, then the
+  //    "history" n-gram also exists. For example, if "A B C" is a valid
+  //    n-gram, then "A B" is also a valid n-gram.
+  int32 last_word = ngram.words[cur_order - 1];
+  if (cur_order > 1) {
+    std::vector<int32> hist(ngram.words.begin(), ngram.words.end() - 1);
+    unordered_map<std::vector<int32>,
+                  LmState*, VectorHasher<int32> >::iterator hist_iter;
+    hist_iter = seq_to_state_.find(hist);
+    if (hist_iter == seq_to_state_.end()) {
+      std::ostringstream ss;
+      for (int i = 0; i < cur_order; ++i)
+        ss << (i == 0 ? '[' : ' ') << ngram.words[i];
+      KALDI_ERR << "In line " << LineNumber() << ": "
+                << cur_order << "-gram " << ss.str() << "] does not have "
+                << "a parent model " << (cur_order - 1) << "-gram.";
+    }
+    if (cur_order != ngram_order_ || ngram_order_ == 1) {
+      KALDI_ASSERT(lm_state != NULL);
+      KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder());
+      hist_iter->second->AddChild(last_word, lm_state);
+    } else {
+      KALDI_ASSERT(lm_state == NULL);
+      KALDI_ASSERT(hist_iter->second->IsChildFinalOrder());
+      hist_iter->second->AddChild(last_word, ngram.logprob);
     }
+  } else {
+    // Unigram: keep <num_words_> at the largest word id seen so far plus 1.
+    num_words_ = std::max(num_words_, last_word + 1);
+  }
-
-  // <num_words_> is <max_word_id> plus 1.
-  num_words_ = max_word_id + 1;
 }
 
 // ConstArpaLm can be built in the following steps, assuming we have already
@@ -503,7 +344,7 @@ void ConstArpaLmBuilder::Read(std::istream &is, bool binary) {
 //    At the same time, we will also create two special buffers:
 //    <unigram_states_>
 //    <overflow_buffer_>
-void ConstArpaLmBuilder::Build() {
+void ConstArpaLmBuilder::ReadComplete() {
   // STEP 1: sorting LmStates lexicographically.
   // Vector for holding the sorted LmStates.
   std::vector<std::pair<std::vector<int32>*, LmState*> > sorted_vec;
@@ -637,9 +478,10 @@ void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const {
   KALDI_ASSERT(is_built_);
 
   // Creates ConstArpaLm.
-  ConstArpaLm const_arpa_lm(bos_symbol_, eos_symbol_, unk_symbol_, ngram_order_,
-                            num_words_, overflow_buffer_size_, lm_states_size_,
-                            unigram_states_, overflow_buffer_, lm_states_);
+  ConstArpaLm const_arpa_lm(
+      Options().bos_symbol, Options().eos_symbol, Options().unk_symbol,
+      ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_,
+      unigram_states_, overflow_buffer_, lm_states_);
   const_arpa_lm.Write(os, binary);
 }
 
@@ -1224,10 +1066,15 @@ bool BuildConstArpaLm(const bool natural_base, const int32 bos_symbol,
                       const int32 eos_symbol, const int32 unk_symbol,
                       const std::string& arpa_rxfilename,
                       const std::string& const_arpa_wxfilename) {
-  ConstArpaLmBuilder lm_builder(natural_base, bos_symbol,
-                                eos_symbol, unk_symbol);
+  ArpaParseOptions options;
+  options.bos_symbol = bos_symbol;
+  options.eos_symbol = eos_symbol;
+  options.unk_symbol = unk_symbol;
+  options.use_log10 = !natural_base;
+
+  ConstArpaLmBuilder lm_builder(options);
+  KALDI_LOG << "Reading " << arpa_rxfilename;
   ReadKaldiObject(arpa_rxfilename, &lm_builder);
-  lm_builder.Build();
   WriteKaldiObject(lm_builder, const_arpa_wxfilename, true);
   return true;
 }
diff --git a/src/lm/kaldi-rnnlm.cc b/src/lm/kaldi-rnnlm.cc
index e1fbcbdc08b..3a811c4c0e5 100644
--- a/src/lm/kaldi-rnnlm.cc
+++ b/src/lm/kaldi-rnnlm.cc
@@ -58,8 +58,8 @@ KaldiRnnlmWrapper::KaldiRnnlmWrapper(
 
 BaseFloat KaldiRnnlmWrapper::GetLogProb(
     int32 word, const std::vector<int32> &wseq,
-    const std::vector<BaseFloat> &context_in,
-    std::vector<BaseFloat> *context_out) {
+    const std::vector<float> &context_in,
+    std::vector<float> *context_out) {
 
   std::vector<std::string> wseq_symbols(wseq.size());
   for (int32 i = 0; i < wseq_symbols.size(); ++i) {
@@ -79,7 +79,7 @@ RnnlmDeterministicFst::RnnlmDeterministicFst(int32 max_ngram_order,
 
   // Uses empty history for <s>.
   std::vector<Label> bos;
-  std::vector<BaseFloat> bos_context(rnnlm->GetHiddenLayerSize(), 1.0f);
+  std::vector<float> bos_context(rnnlm->GetHiddenLayerSize(), 1.0);
   state_to_wseq_.push_back(bos);
   state_to_context_.push_back(bos_context);
   wseq_to_state_[bos] = 0;
@@ -101,7 +101,7 @@ bool RnnlmDeterministicFst::GetArc(StateId s, Label ilabel, fst::StdArc *oarc) {
   KALDI_ASSERT(static_cast<size_t>(s) < state_to_wseq_.size());
 
   std::vector<Label> wseq = state_to_wseq_[s];
-  std::vector<BaseFloat> new_context(rnnlm_->GetHiddenLayerSize());
+  std::vector<float> new_context(rnnlm_->GetHiddenLayerSize());
   BaseFloat logprob = rnnlm_->GetLogProb(ilabel, wseq,
                                          state_to_context_[s], &new_context);
 
diff --git a/src/lm/kaldi-rnnlm.h b/src/lm/kaldi-rnnlm.h
index 5db1e7bc997..2383058a1a8 100644
--- a/src/lm/kaldi-rnnlm.h
+++ b/src/lm/kaldi-rnnlm.h
@@ -56,8 +56,8 @@ class KaldiRnnlmWrapper {
   int32 GetEos() const { return eos_; }
 
   BaseFloat GetLogProb(int32 word, const std::vector<int32> &wseq,
-                       const std::vector<BaseFloat> &context_in,
-                       std::vector<BaseFloat> *context_out);
+                       const std::vector<float> &context_in,
+                       std::vector<float> *context_out);
 
  private:
   rnnlm::CRnnLM rnnlm_;
@@ -96,7 +96,7 @@ class RnnlmDeterministicFst
 
   KaldiRnnlmWrapper *rnnlm_;
   int32 max_ngram_order_;
-  std::vector<std::vector<BaseFloat> > state_to_context_;
+  std::vector<std::vector<float> > state_to_context_;
 };
 
 }  // namespace kaldi
diff --git a/src/matrix/kaldi-matrix.cc b/src/matrix/kaldi-matrix.cc
index 76b83ea7114..4b8056dbcdb 100644
--- a/src/matrix/kaldi-matrix.cc
+++ b/src/matrix/kaldi-matrix.cc
@@ -1969,6 +1969,17 @@ void MatrixBase<Real>::ApplyHeaviside() {
   }
 }
 
+template<typename Real>
+void MatrixBase<Real>::ApplySignum() {
+  // Positive entries become 1 and negative entries -1; anything else
+  // (i.e. entries that compare neither above nor below zero) is left as is.
+  for (MatrixIndexT r = 0, rows = num_rows_, cols = num_cols_; r < rows; r++) {
+    for (Real *x = this->RowData(r), *end = x + cols; x != end; ++x) {
+      if (*x > 0) *x = 1.0;
+      else if (*x < 0) *x = -1.0;
+    }
+  }
+}
 
 template<typename Real>
 bool MatrixBase<Real>::Power(Real power) {
diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h
index c16ffb22135..f038b1b7fa5 100644
--- a/src/matrix/kaldi-matrix.h
+++ b/src/matrix/kaldi-matrix.h
@@ -349,6 +349,10 @@ class MatrixBase {
   /// RectifiedLinearComponent in the neural net code.
   void ApplyHeaviside();
 
+  /// Applies the Signum function (1 if x > 0, 0 if x = 0 and -1 if x < 0)
+  /// to all matrix elements
+  void ApplySignum();
+  
   /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
   /// P^{-1}.  Be careful: the relationship of D to the eigenvalues we output is
   /// slightly complicated, due to the need for P to be real.  In the symmetric
diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc
index 1b456501b5c..9c03738cf1c 100644
--- a/src/nnet2/nnet-component.cc
+++ b/src/nnet2/nnet-component.cc
@@ -3933,8 +3933,8 @@ void Convolutional1dComponent::Propagate(const ChunkInfo &in_info,
   }
   
   // apply all filters
-  AddMatMatBatched(1.0f, tgt_batch, patch_batch, kNoTrans, filter_params_batch,
-		  kTrans, 1.0f);
+  AddMatMatBatched<BaseFloat>(1.0, tgt_batch, patch_batch, kNoTrans, filter_params_batch,
+		  kTrans, 1.0);
 
   // release memory
   delete filter_params_elem;
@@ -4060,8 +4060,8 @@ void Convolutional1dComponent::Backprop(const ChunkInfo &in_info,
 				    p * num_filters, num_filters)));
     filter_params_batch.push_back(filter_params_elem);  
   }
-  AddMatMatBatched(1.0f, patch_deriv_batch, out_deriv_batch, kNoTrans, 
-		  filter_params_batch, kNoTrans, 0.0f);
+  AddMatMatBatched<BaseFloat>(1.0, patch_deriv_batch, out_deriv_batch, kNoTrans, 
+		  filter_params_batch, kNoTrans, 0.0);
 
   // release memory
   delete filter_params_elem;
@@ -4275,8 +4275,8 @@ void Convolutional1dComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
 				    p * filter_dim, filter_dim)));
   }
 
-  AddMatMatBatched(1.0f, filters_grad_batch, diff_patch_batch, kTrans, patch_batch,
-		  kNoTrans, 1.0f);
+  AddMatMatBatched<BaseFloat>(1.0, filters_grad_batch, diff_patch_batch, kTrans, patch_batch,
+		  kNoTrans, 1.0);
 
   // add the row blocks together to filters_grad
   filters_grad.AddMatBlocks(1.0, filters_grad_blocks_batch);
diff --git a/src/nnet2/nnet-compute-discriminative.cc b/src/nnet2/nnet-compute-discriminative.cc
index 72a579d608f..3a389319dd8 100644
--- a/src/nnet2/nnet-compute-discriminative.cc
+++ b/src/nnet2/nnet-compute-discriminative.cc
@@ -113,7 +113,31 @@ NnetDiscriminativeUpdater::NnetDiscriminativeUpdater(
               << opts_.silence_phones_str;
   }
   const Nnet &nnet = am_nnet_.GetNnet();
-  nnet.ComputeChunkInfo(eg_.input_frames.NumRows(), 1, &chunk_info_out_);
+  int32 dim = eg_.input_frames.NumCols();
+  if (dim != nnet.InputDim()) {
+    KALDI_ERR << "Feature dimension is " << dim << " but network expects "
+              << nnet.InputDim();
+  }
+  forward_data_.resize(nnet.NumComponents() + 1);
+
+  SubMatrix<BaseFloat> input_feats = GetInputFeatures();
+  
+  int32 num_rows = input_feats.NumRows();
+
+  nnet.ComputeChunkInfo(num_rows, 1, &chunk_info_out_);
+
+  int32 spk_dim = eg_.spk_info.Dim();
+  if (spk_dim == 0) {
+    forward_data_[0] = input_feats;
+  } else {
+    forward_data_[0].Resize(input_feats.NumRows(),
+                            input_feats.NumCols() + eg_.spk_info.Dim());
+    forward_data_[0].Range(0, input_feats.NumRows(),
+                           0, input_feats.NumCols()).CopyFromMat(input_feats);
+    forward_data_[0].Range(0, input_feats.NumRows(),
+                           input_feats.NumCols(), spk_dim).CopyRowsFromVec(
+                               eg_.spk_info);
+  }
 }
 
 
@@ -141,22 +165,7 @@ SubMatrix<BaseFloat> NnetDiscriminativeUpdater::GetInputFeatures() const {
 
 void NnetDiscriminativeUpdater::Propagate() {
   const Nnet &nnet = am_nnet_.GetNnet();
-  forward_data_.resize(nnet.NumComponents() + 1);
   
-  SubMatrix<BaseFloat> input_feats = GetInputFeatures();
-  int32 spk_dim = eg_.spk_info.Dim();
-  if (spk_dim == 0) {
-    forward_data_[0] = input_feats;
-  } else {
-    forward_data_[0].Resize(input_feats.NumRows(),
-                            input_feats.NumCols() + eg_.spk_info.Dim());
-    forward_data_[0].Range(0, input_feats.NumRows(),
-                           0, input_feats.NumCols()).CopyFromMat(input_feats);
-    forward_data_[0].Range(0, input_feats.NumRows(),
-                           input_feats.NumCols(), spk_dim).CopyRowsFromVec(
-                               eg_.spk_info);
-  }
-
   for (int32 c = 0; c < nnet.NumComponents(); c++) {
     const Component &component = nnet.GetComponent(c);
     CuMatrix<BaseFloat> &input = forward_data_[c],
diff --git a/src/nnet2/nnet-example-functions.cc b/src/nnet2/nnet-example-functions.cc
index 87184cd16e4..0c5ad4cad5e 100644
--- a/src/nnet2/nnet-example-functions.cc
+++ b/src/nnet2/nnet-example-functions.cc
@@ -992,6 +992,49 @@ void CombineDiscriminativeExamples(
 }
 
 
+// Left-pads a discriminative example so that it has exactly max_length
+// labeled frames.  The first feature row is replicated pad_frames times, the
+// numerator alignment is prefixed with an arbitrary transition-id, and a
+// single-state lattice carrying that same prefix is concatenated in front of
+// the denominator lattice.  Returns false, leaving *eg untouched, if the
+// example already has more than max_length labeled frames.
+bool PadDiscriminativeExamples(
+    int32 max_length,
+    DiscriminativeNnetExample *eg) {
+  KALDI_ASSERT(max_length >= 0);
+  int32 num_frames = static_cast<int32>(eg->num_ali.size());
+  if (num_frames > max_length) return false;
+
+  int32 pad_frames = max_length - num_frames;
+  int32 nrows = eg->input_frames.NumRows();
+  int32 dim = eg->input_frames.NumCols();
+
+  Matrix<BaseFloat> feats(nrows + pad_frames, dim);
+  // With no padding the row range below would be empty, so skip it.
+  if (pad_frames > 0)
+    feats.Range(0, pad_frames, 0, dim).CopyRowsFromVec(eg->input_frames.Row(0));
+  feats.Range(pad_frames, nrows, 0, dim).CopyFromMat(eg->input_frames);
+  eg->input_frames.Swap(&feats);
+
+  // The padded frames all get this placeholder transition-id.
+  int32 arbitrary_tid = 1;
+  std::vector<int32> inter_segment_ali(pad_frames, arbitrary_tid);
+
+  // One-state acceptor whose final weight carries the padding alignment.
+  CompactLattice inter_segment_clat;
+  int32 initial = inter_segment_clat.AddState();  // state 0.
+  inter_segment_clat.SetStart(initial);
+  CompactLatticeWeight final_weight = CompactLatticeWeight::One();
+  final_weight.SetString(inter_segment_ali);
+  inter_segment_clat.SetFinal(initial, final_weight);
+  fst::Concat(inter_segment_clat, &(eg->den_lat));  // prefix goes before den_lat
+  fst::TopSort(&(eg->den_lat));
+  eg->num_ali.insert(eg->num_ali.begin(), inter_segment_ali.begin(),
+                     inter_segment_ali.end());
+  eg->Check();
+  return true;
+}
+
 
 } // namespace nnet2
 } // namespace kaldi
diff --git a/src/nnet2/nnet-example-functions.h b/src/nnet2/nnet-example-functions.h
index fac48b2f383..bb4b9a5d991 100644
--- a/src/nnet2/nnet-example-functions.h
+++ b/src/nnet2/nnet-example-functions.h
@@ -292,7 +292,8 @@ void UpdateHash(
     double *den_weight,
     double *tot_t);
 
-
+bool PadDiscriminativeExamples(int32 max_length,
+                               DiscriminativeNnetExample *eg);
 
 } // namespace nnet2
 } // namespace kaldi
diff --git a/src/nnet2/nnet-precondition-online-test.cc b/src/nnet2/nnet-precondition-online-test.cc
index 193bf06c84a..ad889be5c55 100644
--- a/src/nnet2/nnet-precondition-online-test.cc
+++ b/src/nnet2/nnet-precondition-online-test.cc
@@ -307,7 +307,7 @@ void UnitTestPreconditionDirectionsOnline() {
     AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02);
 
     AssertEqual(Mcopy1, Mcopy2);
-    AssertEqual(row_prod1, row_prod2, 1.0e-02f);
+    AssertEqual<BaseFloat>(row_prod1, row_prod2, 1.0e-02);
     AssertEqual(gamma1, gamma2, 1.0e-02);
 
     // make sure positive definite
diff --git a/src/nnet2bin/nnet-get-egs-discriminative.cc b/src/nnet2bin/nnet-get-egs-discriminative.cc
index 58db6972567..8e8c567aafd 100644
--- a/src/nnet2bin/nnet-get-egs-discriminative.cc
+++ b/src/nnet2bin/nnet-get-egs-discriminative.cc
@@ -46,7 +46,13 @@ int main(int argc, char *argv[]) {
     
     SplitDiscriminativeExampleConfig split_config;
     
+    int32 left_context = -1, right_context = -1;
+    
     ParseOptions po(usage);
+    po.Register("left-context", &left_context, "Number of frames of left "
+                "context the neural net requires.");
+    po.Register("right-context", &right_context, "Number of frames of right "
+                "context the neural net requires.");
     split_config.Register(&po);
     
     po.Read(argc, argv);
@@ -72,9 +78,20 @@ int main(int argc, char *argv[]) {
       am_nnet.Read(ki.Stream(), binary);
     }
 
-    int32 left_context = am_nnet.GetNnet().LeftContext(),
-        right_context = am_nnet.GetNnet().RightContext();
+    if (left_context >= 0) {
+      KALDI_ASSERT(left_context >= am_nnet.GetNnet().LeftContext());
+    } else {
+      left_context = am_nnet.GetNnet().LeftContext();
+    }
 
+    if (right_context >= 0) {
+      KALDI_ASSERT(right_context >= am_nnet.GetNnet().RightContext());
+    } else {
+      right_context = am_nnet.GetNnet().RightContext();
+    }
+    
+    KALDI_LOG << "left-context = " << left_context;
+    KALDI_LOG << "right-context = " << right_context;
     
     // Read in all the training files.
     SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
@@ -123,18 +140,29 @@ int main(int argc, char *argv[]) {
       
       KALDI_VLOG(2) << "Split lattice " << key << " into "
                     << egs.size() << " pieces.";
-      for (size_t i = 0; i < egs.size(); i++) {
-        // Note: excised_egs will be of size 0 or 1.
-        std::vector<DiscriminativeNnetExample> excised_egs;
-        ExciseDiscriminativeExample(split_config, trans_model, egs[i],
-                                    &excised_egs, &stats);
-        for (size_t j = 0; j < excised_egs.size(); j++) {
+
+      if (split_config.excise) {
+        for (size_t i = 0; i < egs.size(); i++) {
+          // Note: excised_egs will be of size 0 or 1.
+          std::vector<DiscriminativeNnetExample> excised_egs;
+          ExciseDiscriminativeExample(split_config, trans_model, egs[i],
+              &excised_egs, &stats);
+          for (size_t j = 0; j < excised_egs.size(); j++) {
+            std::ostringstream os;
+            os << (examples_count++);
+            std::string example_key = os.str();
+            example_writer.Write(example_key, excised_egs[j]);
+          }
+        }
+      } else {
+        for (size_t i = 0; i < egs.size(); i++) {
           std::ostringstream os;
           os << (examples_count++);
           std::string example_key = os.str();
-          example_writer.Write(example_key, excised_egs[j]);
+          example_writer.Write(example_key, egs[i]);
         }
       }
+
       num_done++;
     }
 
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index 361d9a714a0..19d516c01a5 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -12,7 +12,8 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \
   nnet-compile-utils-test nnet-nnet-test nnet-utils-test \
   nnet-compile-test nnet-analyze-test nnet-compute-test \
   nnet-optimize-test nnet-derivative-test nnet-example-test \
-  nnet-common-test
+  nnet-common-test discriminative-supervision-test \
+	discriminative-training-test
 
 OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
   nnet-simple-component.o \
@@ -24,11 +25,14 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
   nnet-example-utils.o nnet-training.o \
   nnet-diagnostics.o nnet-combine.o nnet-am-decodable-simple.o \
   nnet-optimize-utils.o nnet-chain-example.o \
-  nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o
+  nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o \
+	discriminative-supervision.o nnet-discriminative-example.o \
+	nnet-discriminative-diagnostics.o \
+	discriminative-training.o nnet-discriminative-training.o
 
 LIBNAME = kaldi-nnet3
 
-ADDLIBS = ../chain/kaldi-chain.a ../thread/kaldi-thread.a \
+ADDLIBS = ../thread/kaldi-thread.a \
        ../lat/kaldi-lat.a ../gmm/kaldi-gmm.a \
       ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \
       ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \
diff --git a/src/nnet3/discriminative-supervision-test.cc b/src/nnet3/discriminative-supervision-test.cc
new file mode 100644
index 00000000000..46eed362775
--- /dev/null
+++ b/src/nnet3/discriminative-supervision-test.cc
@@ -0,0 +1,464 @@
+// nnet3/discriminative-supervision-test.cc
+
+// Copyright      2015  Johns Hopkins University (author:  Daniel Povey)
+// Copyright 2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/discriminative-supervision.h"
+#include "nnet3/nnet-example-utils.h"
+#include "lat/lattice-functions.h"
+
+namespace kaldi {
+namespace discriminative {
+
+void UnitTestSupervisionSplitter(const SplitDiscriminativeSupervisionOptions &splitter_config,
+                                 const DiscriminativeSupervision &supervision,
+                                 const std::vector<int32> &range_starts, int32 frames_per_eg,
+                                 std::vector<DiscriminativeSupervision*> *supervision_splits,
+                                 Lattice *splitter_lat) {
+  supervision_splits->clear();
+  DiscriminativeSupervisionSplitter splitter(splitter_config, supervision);
+  // Cut one frames_per_eg-long piece per entry of range_starts and check it.
+  for (size_t i = 0; i < range_starts.size(); i++) {
+    int32 range_start = range_starts[i];
+    // Raw pointer: the piece is stored in *supervision_splits, not freed here.
+    DiscriminativeSupervision* supervision_part = new DiscriminativeSupervision();
+
+    splitter.GetFrameRange(range_start,
+                           frames_per_eg,
+                           supervision_part);
+
+    supervision_splits->push_back(supervision_part);
+      
+    if (supervision.weights.size() > 0)
+      KALDI_ASSERT(supervision_part->weights.size() > 0);
+    // Per-frame fields must equal the original's at offset range_start.
+    for (size_t t = 0; t < frames_per_eg; t++) {
+      KALDI_ASSERT(supervision_part->num_ali[t] == supervision.num_ali[t + range_start]);
+      if (supervision.weights.size() > 0)
+        KALDI_ASSERT(supervision_part->weights[t] == supervision.weights[t + range_start]);
+      if (supervision.oracle_ali.size() > 0)
+        KALDI_ASSERT(supervision_part->oracle_ali[t] == supervision.oracle_ali[t + range_start]);
+    }
+  }
+  // Also hand back the lattice exposed by the splitter's DenLat().
+  *splitter_lat = splitter.DenLat();
+}
+
+void UnitTestLatticeSplitPosteriors(const Lattice &lat, 
+                               const std::vector<const Lattice*> &lat_splits,
+                               const std::vector<int32> &range_starts,
+                               std::vector<Posterior> *post_splits) {
+  Posterior post;
+  double lat_ac_like;
+  std::vector<double> alpha;
+  std::vector<double> beta;
+  double lat_like = LatticeForwardBackward(lat, &post, &lat_ac_like, &alpha, &beta);
+  KALDI_LOG << "Lattice score is " << lat_like;
+  // Recompute the alphas and betas separately; they must match exactly.
+  std::vector<double> alpha2;
+  std::vector<double> beta2;
+  ComputeLatticeAlphasAndBetas(lat, false, &alpha2, &beta2);
+
+  KALDI_ASSERT(alpha == alpha2 && beta == beta2);
+
+  std::vector<int32> state_times;
+  LatticeStateTimes(lat, &state_times);
+  // *post_splits ends up with one posterior per split lattice.
+  post_splits->clear();
+  post_splits->resize(lat_splits.size());
+
+  size_t s = 0, n = 0;
+  for (std::vector<const Lattice*>::const_iterator it = lat_splits.begin();
+        it != lat_splits.end(); ++it, s++) {
+    Posterior &post_part = (*post_splits)[s];
+    std::vector<double> alpha_part;
+    std::vector<double> beta_part;
+    
+    double lat_splits_ac_like = 0.0;
+    double lat_splits_like = LatticeForwardBackward(**it, &post_part, &lat_splits_ac_like, &alpha_part, &beta_part);
+
+    if (lat_splits_like > 1e-7) 
+      KALDI_WARN << "lat_splits_like = " << lat_splits_like << " is greater than 1e-7";
+    // n is only consumed by the disabled alpha check below.
+    n = std::lower_bound(state_times.begin(), state_times.end(), range_starts[s]) - state_times.begin();
+    if (s == 0) n--;
+    //for (size_t n_part = 1; n_part < alpha_part.size()-1; n_part++) {
+    //  KALDI_ASSERT(kaldi::ApproxEqual(alpha_part[n_part], alpha[n_part + n], .1));
+    //}
+    // Frame i of split s is compared with frame range_starts[s] + i of lat.
+    for (size_t i = 0; i < post_part.size(); i++) {
+      size_t t = i + range_starts[s];
+      KALDI_ASSERT(post_part[i].size() == post[t].size());
+      for (size_t j = 0; j < post_part[i].size(); j++) {
+        KALDI_ASSERT(post_part[i][j].first == post[t][j].first);
+        if (post_part[i][j].second < 1e-6 && post[t][j].second < 1e-6) continue;
+        //KALDI_ASSERT(kaldi::ApproxEqual(post_part[i][j].second, post[t][j].second, .1) || std::abs(post_part[i][j].second - post[t][j].second) < .1);
+      }
+    }
+  }
+  //KALDI_ASSERT(kaldi::ApproxEqual(lat_ac_like, lat_splits_ac_like));
+  //KALDI_ASSERT(kaldi::ApproxEqual(lat_like, lat_splits_like));
+}
+
+void UnitTestMmiPosteriors(
+    const TransitionModel &tmodel,
+    const std::vector<int32> &num_ali,
+    const Lattice &lat, 
+    const std::vector<const std::vector<int32>*> &ali_splits,
+    const std::vector<const Lattice*> &lat_splits, 
+    const std::vector<int32> range_starts,
+    std::vector<Posterior> *post_splits) {
+  // MMI posteriors of the whole lattice are the reference for every split.
+  Posterior full_post;
+  LatticeForwardBackwardMmi(tmodel, lat, num_ali, false, false, false,
+                            &full_post);
+
+  for (size_t split = 0; split < ali_splits.size(); split++) {
+    Posterior split_post;
+    LatticeForwardBackwardMmi(tmodel, *(lat_splits[split]), *(ali_splits[split]),
+                              false, false, false, &split_post);
+    post_splits->push_back(split_post);
+
+    // Frame f of this split corresponds to frame range_starts[split] + f.
+    for (size_t f = 0; f < split_post.size(); f++) {
+      const size_t t = f + range_starts[split];
+      KALDI_ASSERT(split_post[f].size() == full_post[t].size());
+      for (size_t j = 0; j < split_post[f].size(); j++) {
+        KALDI_ASSERT(split_post[f][j].first == full_post[t][j].first);
+        if (split_post[f][j].second < 1e-6 && full_post[t][j].second < 1e-6) continue;
+        KALDI_ASSERT(kaldi::ApproxEqual(split_post[f][j].second, full_post[t][j].second, .1) || std::abs(split_post[f][j].second - full_post[t][j].second) < .1);
+      }
+    }
+  }
+}
+
+void UnitTestMpePosteriors(
+    const TransitionModel &tmodel,
+    const std::vector<int32> &silence_phones,
+    std::string criterion,
+    bool one_silence_class,
+    const std::vector<int32> &num_ali,
+    const Lattice &lat, 
+    const std::vector<const std::vector<int32>*> &ali_splits,
+    const std::vector<const Lattice*> &lat_splits, 
+    const std::vector<int32> range_starts,
+    std::vector<Posterior> *post_splits) {
+  // Compares MPE-variant posteriors/accuracy of the splits with the full lattice.
+  Posterior post;
+  double acc;
+  try {
+    acc = LatticeForwardBackwardMpeVariants(tmodel, silence_phones, lat, num_ali,
+                                            criterion, one_silence_class, &post);
+  } catch (std::exception &e) {
+    KALDI_LOG << e.what();
+    return;
+  }
+
+  double splits_acc = 0;
+  int32 splits_count = 0;
+  for (size_t s = 0; s < ali_splits.size(); s++) {
+    Posterior post_part;
+    try {
+      splits_acc += LatticeForwardBackwardMpeVariants(
+          tmodel, silence_phones, *(lat_splits[s]), *(ali_splits[s]),
+          criterion, one_silence_class, &post_part);
+      splits_count += (ali_splits[s])->size();
+    } catch (std::exception &e) {
+      KALDI_LOG << e.what();
+      continue;
+    }
+    post_splits->push_back(post_part);
+    const size_t num_part_frames = post_part.size();
+    for (size_t f = 0; f < num_part_frames; f++) {
+      size_t t = f + range_starts[s];
+      KALDI_ASSERT(post_part[f].size() == 0 || post[t].size() == 0 || post_part[f].size() == post[t].size());
+      for (size_t j = 0; j < post_part[f].size(); j++) {
+        KALDI_ASSERT(post_part[f][j].first == post[t][j].first);
+        // Skip frames near the split edges; sums avoid size_t wrap-around.
+        if (f < 10 || f + 10 > num_part_frames) continue;
+        if (post_part[f][j].second < 1e-6 && post[t][j].second < 1e-6) continue;
+        if(!(kaldi::ApproxEqual(post_part[f][j].second, post[t][j].second, .1) || std::abs(post_part[f][j].second - post[t][j].second) < .1)) {
+          KALDI_WARN << "MPE split post = " << post_part[f][j].second << " vs " << post[t][j].second;
+          if (f > 20 && f + 20 < num_part_frames)
+            KALDI_ASSERT(kaldi::ApproxEqual(post_part[f][j].second, post[t][j].second, .1) || std::abs(post_part[f][j].second - post[t][j].second) < .1);
+        }
+      }
+    }
+  }
+
+  if(!kaldi::ApproxEqual(acc / num_ali.size(), splits_acc / splits_count, 1e-2)) {
+    // Log the two values first so they are visible even if the assert fires.
+    KALDI_WARN << "acc = " << acc / num_ali.size() << ", while splits acc = " << splits_acc / splits_count;
+    KALDI_ASSERT(kaldi::ApproxEqual(acc / num_ali.size(), splits_acc / splits_count, 1e-1));
+  }
+}
+
+void UnitTestSupervisionMerge(const DiscriminativeSupervision &supervision,
+                              const std::vector<const DiscriminativeSupervision*> &supervision_splits,
+                              const std::vector<int32> &range_starts,
+                              const std::vector<Posterior> &post_splits) {
+  std::vector<DiscriminativeSupervision> out_supervisions;
+  AppendSupervision(supervision_splits, true, &out_supervisions);
+  DiscriminativeSupervision &out_supervision = out_supervisions.back();
+  // Only the last merged supervision is examined; post_splits is unused here.
+  Posterior post, out_post;
+  LatticeForwardBackward(supervision.den_lat, &post, NULL);
+  LatticeForwardBackward(out_supervision.den_lat, &out_post, NULL);
+  // Frame i of split s is looked up at s * frames_per_sequence + i in out_post.
+  for (size_t s = 0; s < range_starts.size(); s++) {
+    for (size_t i = 0; i < out_supervision.frames_per_sequence; i++) {
+      size_t t = range_starts[s] + i, ot = s * out_supervision.frames_per_sequence + i;
+      KALDI_ASSERT(out_post[ot].size() == post[t].size());
+      for (size_t j = 0; j < out_post[ot].size(); j++) {
+        KALDI_ASSERT(out_post[ot][j].first == post[t][j].first);
+        if (out_post[ot][j].second < 1e-6 && post[t][j].second < 1e-6) continue;
+        //KALDI_ASSERT(kaldi::ApproxEqual(out_post[ot][j].second, post[t][j].second, .1) || std::abs(out_post[ot][j].second - post[t][j].second) < 1e-4);
+      }
+    }
+  }
+}
+
+void UnitTestSupervision(const SplitDiscriminativeSupervisionOptions &splitter_config,
+                         const TransitionModel &tmodel,
+                         DiscriminativeSupervision *supervision,
+                         int32 frames_per_eg) {
+  int32 num_frames = supervision->frames_per_sequence;
+  KALDI_ASSERT(supervision->num_sequences == 1);
+  // range_starts is filled by nnet3::SplitIntoRanges() and must be non-empty.
+  std::vector<int32> range_starts;
+  nnet3::SplitIntoRanges(num_frames, frames_per_eg, &range_starts);
+
+  KALDI_ASSERT(!range_starts.empty());
+
+  std::vector<DiscriminativeSupervision*> supervision_splits;
+  // Split the supervision; the pieces are heap-allocated and deleted at the end.
+  Lattice splitter_lat;
+  UnitTestSupervisionSplitter(splitter_config, *supervision, range_starts, frames_per_eg, &supervision_splits, &splitter_lat);
+  // Apply the configured acoustic scale to each piece and collect const views.
+  std::vector<const Lattice*> lat_splits;
+  std::vector<const std::vector<int32>*> ali_splits;
+  for (std::vector<DiscriminativeSupervision*>::iterator it = supervision_splits.begin(); 
+        it != supervision_splits.end(); ++it) {
+    fst::ScaleLattice(fst::AcousticLatticeScale(splitter_config.supervision_config.acoustic_scale), &((*it)->den_lat));
+    lat_splits.push_back(const_cast<Lattice*>(&((*it)->den_lat)));
+    ali_splits.push_back(const_cast<std::vector<int32>*>(&((*it)->num_ali)));
+  }
+  // From here on *supervision uses the lattice returned by the splitter.
+  supervision->den_lat = splitter_lat;
+  Lattice &den_lat = supervision->den_lat;
+ 
+  // Check that the posteriors for all the frames agree
+  // before and after splitting.
+  std::vector<Posterior> post_splits;
+  UnitTestLatticeSplitPosteriors(den_lat, lat_splits, range_starts, &post_splits);
+
+  std::vector<Posterior> mmi_post_splits;
+  UnitTestMmiPosteriors(tmodel,
+                        supervision->num_ali, supervision->den_lat,
+                        ali_splits, lat_splits, 
+                        range_starts, &mmi_post_splits);
+ 
+  std::vector<int32> silence_phones;
+  for (int32 i = 1; i <= 15; i++) silence_phones.push_back(i);
+  // i = 0..3 runs: (smbr, true), (smbr, false), (mpfe, false), (mpfe, true).
+  std::string criterion = "smbr";
+  bool one_silence_class = true;
+  for (int32 i = 0; i < 4; i++) {
+    if (i == 1) one_silence_class = false;
+    if (i == 2) criterion = "mpfe";
+    if (i == 3) one_silence_class = true;
+
+    std::vector<Posterior> mpe_post_splits;
+    UnitTestMpePosteriors(tmodel, silence_phones, criterion, one_silence_class, 
+                          supervision->num_ali, supervision->den_lat,
+                          ali_splits, lat_splits, 
+                          range_starts, &mpe_post_splits);
+  }
+
+  std::vector<const DiscriminativeSupervision*> supervision_splits_const;
+  for (size_t i = 0; i < supervision_splits.size(); i++) {
+    supervision_splits_const.push_back(const_cast<DiscriminativeSupervision*> (supervision_splits[i]));
+  }
+
+  // Check again after merging
+  UnitTestSupervisionMerge(*supervision, supervision_splits_const, range_starts, post_splits);
+  // Release the pieces allocated by UnitTestSupervisionSplitter().
+  for (std::vector<DiscriminativeSupervision*>::iterator it = supervision_splits.begin(); it != supervision_splits.end(); ++it) {
+    delete *it;
+  }
+}
+
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::discriminative;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Get a discriminative training supervision object for each file of training data\n"
+        " and test if splitting and merging works correctly\n"
+        "Input can come in two formats: \n"
+        "numerator alignments / denominator lattice pair \n"
+        ", or numerator and denominator lattice pair\n"
+        "Usage: discriminative-supervision-test [options] <model> \\\n"
+        "<ali-rspecifier> <den-lattice-rspecifier>\n";
+
+    std::string num_lat_rspecifier;
+    std::string oracle_rspecifier;
+    std::string frame_weights_rspecifier;
+    int32 frames_per_eg = 150;
+
+    discriminative::SplitDiscriminativeSupervisionOptions splitter_config;
+
+    ParseOptions po(usage);
+    po.Register("num-lat-rspecifier", &num_lat_rspecifier, "Get supervision "
+                "with numerator lattice");
+    po.Register("oracle-rspecifier", &oracle_rspecifier, "Add oracle "
+                "alignment to supervision");
+    po.Register("frame-weights-rspecifier", &frame_weights_rspecifier,
+                "Add frame weights to supervision");
+    po.Register("num-frames", &frames_per_eg, "Number of frames with labels "
+                "that each example contains.  Will be rounded up to a multiple "
+                "of --frame-subsampling-factor.");
+    
+    ParseOptions splitter_opts("supervision-splitter", &po);
+    splitter_config.Register(&splitter_opts);
+
+    po.Read(argc, argv);
+    // Three positional arguments: model, alignments, denominator lattices.
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+    
+    std::string model_rxfilename = po.GetArg(1),
+                num_ali_rspecifier = po.GetArg(2),
+                den_lat_rspecifier = po.GetArg(3);
+
+    TransitionModel tmodel;
+    {
+      bool binary;
+      Input ki(model_rxfilename, &binary);
+      tmodel.Read(ki.Stream(), binary);
+    }
+
+    RandomAccessCompactLatticeReader den_lat_reader(den_lat_rspecifier);
+    SequentialInt32VectorReader ali_reader(num_ali_rspecifier);
+
+    RandomAccessCompactLatticeReader num_lat_reader(num_lat_rspecifier);
+    RandomAccessInt32VectorReader oracle_reader(oracle_rspecifier);
+    RandomAccessBaseFloatVectorReader frame_weights_reader(frame_weights_rspecifier);
+
+    int32 num_utts_done = 0, num_utts_error = 0;
+    // Walk the numerator alignments; every other input is looked up by key.
+    for (; !ali_reader.Done(); ali_reader.Next())  {
+      const std::string &key = ali_reader.Key();
+      const std::vector<int32> &num_ali = ali_reader.Value();
+      
+      if (!den_lat_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find denominator lattice for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+
+      if (!num_lat_rspecifier.empty() && !num_lat_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find numerator lattice for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+      
+      if (!oracle_rspecifier.empty() && !oracle_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find oracle alignment for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+
+      if (!frame_weights_rspecifier.empty() && !frame_weights_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find frame weights for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+      // Optional inputs stay empty unless their rspecifier was given.
+      Vector<BaseFloat> frame_weights;
+      std::vector<int32> oracle_ali;
+      
+      if (!oracle_rspecifier.empty()) {
+        oracle_ali = oracle_reader.Value(key);
+      }
+
+      if (!frame_weights_rspecifier.empty()) {
+        frame_weights = frame_weights_reader.Value(key);
+      }
+
+      const CompactLattice &den_clat = den_lat_reader.Value(key);
+
+      DiscriminativeSupervision supervision;
+
+      if (!num_lat_rspecifier.empty()) {
+        const CompactLattice &num_clat = num_lat_reader.Value(key);
+        if (!LatticeToDiscriminativeSupervision(num_ali,
+            num_clat, den_clat, 1.0, &supervision, 
+            (!frame_weights_rspecifier.empty() ? &frame_weights : NULL), 
+            (!oracle_rspecifier.empty() ? &oracle_ali : NULL))) {
+          KALDI_WARN << "Failed to convert lattice to supervision "
+                     << "for utterance " << key;
+          num_utts_error++;
+          continue;
+        }
+      } else {
+        if (!LatticeToDiscriminativeSupervision(num_ali,
+            den_clat, 1.0, &supervision,
+            (!frame_weights_rspecifier.empty() ? &frame_weights : NULL), 
+            (!oracle_rspecifier.empty() ? &oracle_ali : NULL))) {
+          KALDI_WARN << "Failed to convert lattice to supervision "
+                     << "for utterance " << key;
+          num_utts_error++;
+          continue;
+        }
+      }
+      // Utterances shorter than one example are skipped without being counted.
+      if (supervision.frames_per_sequence < frames_per_eg) continue;
+
+      UnitTestSupervision(splitter_config, tmodel, &supervision, frames_per_eg);
+
+      num_utts_done++;
+    } 
+    
+    KALDI_LOG << "Generated discriminative supervision information for "
+              << num_utts_done << " utterances, errors on "
+              << num_utts_error;
+    return (num_utts_done > num_utts_error ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc
new file mode 100644
index 00000000000..cd7f1f4c0f0
--- /dev/null
+++ b/src/nnet3/discriminative-supervision.cc
@@ -0,0 +1,687 @@
+// nnet3/discriminative-supervision.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet3/discriminative-supervision.h"
+#include "lat/lattice-functions.h"
+
+namespace kaldi {
+namespace discriminative {
+
+void DiscriminativeSupervisionOptions::Check() const {
+  KALDI_ASSERT(frame_subsampling_factor > 0);  // the only option validated here
+}
+
+DiscriminativeSupervision::DiscriminativeSupervision(const DiscriminativeSupervision &other):
+    weight(other.weight), num_sequences(other.num_sequences),
+    frames_per_sequence(other.frames_per_sequence), 
+    num_ali(other.num_ali), oracle_ali(other.oracle_ali),
+    weights(other.weights),
+    num_lat_present(other.num_lat_present),
+    num_lat(other.num_lat),
+    den_lat(other.den_lat) { }  // member-wise copy; the body is intentionally empty
+
+void DiscriminativeSupervision::Swap(DiscriminativeSupervision *other) {  // exchanges every member with *other
+  std::swap(weight, other->weight);
+  std::swap(num_sequences, other->num_sequences);
+  std::swap(frames_per_sequence, other->frames_per_sequence);
+  std::swap(num_ali, other->num_ali);
+  std::swap(oracle_ali, other->oracle_ali);
+  std::swap(weights, other->weights);
+  std::swap(num_lat_present, other->num_lat_present);
+  std::swap(num_lat, other->num_lat);
+  std::swap(den_lat, other->den_lat);
+}
+
+bool DiscriminativeSupervision::operator == (const DiscriminativeSupervision &other) const {
+  return ( weight == other.weight && num_sequences == other.num_sequences &&
+      frames_per_sequence == other.frames_per_sequence &&
+      num_ali == other.num_ali &&
+      oracle_ali == other.oracle_ali &&
+      weights == other.weights &&
+      num_lat_present == other.num_lat_present &&
+      fst::Equal(num_lat, other.num_lat) &&   // lattices are compared with fst::Equal,
+      fst::Equal(den_lat, other.den_lat) );   // all other members with operator==
+}
+
+void DiscriminativeSupervision::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<DiscriminativeSupervision>");
+  WriteToken(os, binary, "<Weight>");
+  WriteBasicType(os, binary, weight);
+  WriteToken(os, binary, "<NumSequences>");
+  WriteBasicType(os, binary, num_sequences);
+  WriteToken(os, binary, "<FramesPerSeq>");
+  WriteBasicType(os, binary, frames_per_sequence);
+  KALDI_ASSERT(frames_per_sequence > 0 &&
+               num_sequences > 0);
+  // Numerator alignment (always written).
+  WriteToken(os, binary, "<NumAli>");
+  WriteIntegerVector(os, binary, num_ali);
+  // The <NumLat> section is optional: it is emitted only when num_lat_present is set.
+  if (num_lat_present) {
+    WriteToken(os, binary, "<NumLat>");
+    if (!WriteLattice(os, binary, num_lat)) {
+      KALDI_ERR << "Error writing numerator lattice to stream";
+    }
+  } 
+  // Oracle alignment: the token and vector are written unconditionally, even if empty.
+  WriteToken(os, binary, "<OracleAli>");
+  WriteIntegerVector(os, binary, oracle_ali);
+  // Frame weights are copied into a Vector<BaseFloat> so they can be written with its Write().
+  WriteToken(os, binary, "<FrameWeights>");
+  Vector<BaseFloat> frame_weights(weights.size());
+  for (size_t i = 0; i < weights.size(); i++) {
+    frame_weights(i) = weights[i];
+  }
+  frame_weights.Write(os, binary);
+  // Denominator lattice (always written).
+  WriteToken(os, binary, "<DenLat>");
+  if (!WriteLattice(os, binary, den_lat)) {
+    // We can't return error status from this function so we
+    // throw an exception. 
+    KALDI_ERR << "Error writing denominator lattice to stream";
+  }
+
+  WriteToken(os, binary, "</DiscriminativeSupervision>");
+}
+
+// Reads the object in the format produced by Write().  The <NumLat> section is
+// optional; when it is absent num_lat_present is reset and num_lat is emptied.
+void DiscriminativeSupervision::Read(std::istream &is, bool binary) {
+  ExpectToken(is, binary, "<DiscriminativeSupervision>");
+  ExpectToken(is, binary, "<Weight>");
+  ReadBasicType(is, binary, &weight);
+  ExpectToken(is, binary, "<NumSequences>");
+  ReadBasicType(is, binary, &num_sequences);
+  ExpectToken(is, binary, "<FramesPerSeq>");
+  ReadBasicType(is, binary, &frames_per_sequence);
+  KALDI_ASSERT(frames_per_sequence > 0 && num_sequences > 0);
+
+  ExpectToken(is, binary, "<NumAli>");
+  ReadIntegerVector(is, binary, &num_ali);
+
+  std::string token;
+  ReadToken(is, binary, &token);
+  num_lat_present = (token == "<NumLat>");
+  if (num_lat_present) {
+    Lattice *lat = NULL;
+    if (!ReadLattice(is, binary, &lat) || lat == NULL) {
+      // We can't return error status from this function so we throw.
+      KALDI_ERR << "Error reading Lattice from stream";
+    }
+    num_lat = *lat;
+    delete lat;
+    TopSort(&num_lat);
+    ReadToken(is, binary, &token);
+  } else {
+    num_lat.DeleteStates();  // drop any lattice left over from an earlier Read
+  }
+  if (token != "<OracleAli>") {
+    KALDI_ERR << "Expecting token <OracleAli>; got token " << token;
+  }
+
+  ReadIntegerVector(is, binary, &oracle_ali);
+
+  ExpectToken(is, binary, "<FrameWeights>");
+  Vector<BaseFloat> frame_weights;
+  frame_weights.Read(is, binary);
+  weights.clear();
+  std::copy(frame_weights.Data(), frame_weights.Data() + frame_weights.Dim(), std::back_inserter(weights));
+
+  ExpectToken(is, binary, "<DenLat>");
+  {
+    Lattice *lat = NULL;
+    if (!ReadLattice(is, binary, &lat) || lat == NULL) {
+      // We can't return error status from this function so we
+      // throw an exception.
+      KALDI_ERR << "Error reading Lattice from stream";
+    }
+    den_lat = *lat;
+    delete lat;
+    TopSort(&den_lat);
+  }
+
+  ExpectToken(is, binary, "</DiscriminativeSupervision>");
+}
+
+// Fills *supervision from an alignment plus numerator and denominator lattices; always returns true.
+bool LatticeToDiscriminativeSupervision(const std::vector<int32> &num_ali,
+                                        const Lattice &num_lat, 
+                                        const Lattice &den_lat,
+                                        BaseFloat weight,
+                                        DiscriminativeSupervision *supervision,
+                                        const Vector<BaseFloat> *weights,
+                                        const std::vector<int32> *oracle_alignment) {
+  supervision->weight = weight;
+  supervision->num_sequences = 1;
+  supervision->frames_per_sequence = num_ali.size();
+  supervision->num_ali = num_ali;
+  supervision->num_lat_present = true;
+  supervision->num_lat = num_lat;
+  TopSort(&(supervision->num_lat));
+  supervision->den_lat = den_lat;
+  TopSort(&(supervision->den_lat));
+  // Reset optional fields first so a reused *supervision keeps no stale data.
+  supervision->weights.clear();
+  if (weights)
+    supervision->weights.assign(weights->Data(), weights->Data() + weights->Dim());
+  supervision->oracle_ali.clear();
+  if (oracle_alignment) supervision->oracle_ali = *oracle_alignment;
+  // Check() asserts that the alignment, weights and lattices agree in length.
+  supervision->Check();
+
+  return true;
+}
+
+// Fills *supervision from an alignment and a denominator lattice only; always returns true.
+bool LatticeToDiscriminativeSupervision(const std::vector<int32> &num_ali,
+                                        const Lattice &den_lat, 
+                                        BaseFloat weight,
+                                        DiscriminativeSupervision *supervision,
+                                        const Vector<BaseFloat> *weights,
+                                        const std::vector<int32> *oracle_alignment) {
+  supervision->weight = weight;
+  supervision->num_sequences = 1;
+  supervision->frames_per_sequence = num_ali.size();
+  supervision->num_ali = num_ali;
+  supervision->num_lat_present = false;
+  supervision->den_lat = den_lat;
+  TopSort(&(supervision->den_lat));
+  // Reset optional fields first so a reused *supervision keeps no stale data.
+  supervision->weights.clear();
+  if (weights)
+    supervision->weights.assign(weights->Data(), weights->Data() + weights->Dim());
+  supervision->oracle_ali.clear();
+  if (oracle_alignment) supervision->oracle_ali = *oracle_alignment;
+  // Check() asserts that the alignment, weights and lattice agree in length.
+  supervision->Check();
+
+  return true;
+}
+
+void DiscriminativeSupervision::Check() const {  // asserts that all per-frame members agree in length
+  int32 num_frames_subsampled = num_ali.size();
+  // The optional per-frame vectors must be empty or have one entry per alignment frame.
+  KALDI_ASSERT(oracle_ali.size() == 0 || 
+               static_cast<int32> (oracle_ali.size()) == num_frames_subsampled);
+  KALDI_ASSERT(weights.size() == 0 || 
+               static_cast<int32> (weights.size()) == num_frames_subsampled);
+  // The value returned by LatticeStateTimes() (max_time) must equal the alignment length.
+  {
+    std::vector<int32> state_times;
+    int32 max_time = LatticeStateTimes(den_lat, &state_times);
+    KALDI_ASSERT(max_time == num_frames_subsampled);
+  }
+  // The same check is applied to the numerator lattice, but only when one is present.
+  if (num_lat_present) {
+    std::vector<int32> state_times;
+    int32 max_time = LatticeStateTimes(num_lat, &state_times);
+    KALDI_ASSERT(max_time == num_frames_subsampled);
+  }
+}
+
+// Copies the supervision's lattices and prepares them (via PrepareLattice) for splitting.
+DiscriminativeSupervisionSplitter::DiscriminativeSupervisionSplitter(
+    const SplitDiscriminativeSupervisionOptions &config,
+    const TransitionModel &tmodel,
+    const DiscriminativeSupervision &supervision):
+    config_(config), tmodel_(tmodel), supervision_(supervision), 
+    num_lat_present_(supervision.num_lat_present) {
+  if (supervision_.num_sequences != 1) {
+    KALDI_WARN << "Splitting already-reattached sequence (only expected in "
+               << "testing code)";
+  }
+  // For now, don't allow splitting already merged examples.
+  KALDI_ASSERT(supervision_.num_sequences == 1);
+
+  // Prepare each lattice:
+  // 1) Order states by state time (stronger than a topological sort)
+  // 2) Compute state times, which must form a non-decreasing vector
+  // 3) Compute lattice alpha and beta scores
+  den_lat_ = supervision_.den_lat;
+  PrepareLattice(&den_lat_, &den_lat_scores_);
+
+  if (supervision_.num_lat_present) {
+    num_lat_ = supervision_.num_lat;
+    PrepareLattice(&num_lat_, &num_lat_scores_);
+  }
+
+  int32 num_states = den_lat_.NumStates(),
+        num_frames = supervision_.frames_per_sequence * supervision_.num_sequences;
+  KALDI_ASSERT(num_states > 0);
+  int32 start_state = den_lat_.Start();
+  // Lattice should be top-sorted and connected, so start-state must be 0.
+  KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0");
+
+  // The following asserts check that the number of frames in each lattice
+  // matches num_frames from the supervision object.  Each lattice is checked
+  // against its own state count and start state, since num and den differ.
+  if (num_lat_present_) {
+    KALDI_ASSERT(num_lat_.NumStates() > 0 && num_lat_.Start() >= 0);
+    KALDI_ASSERT(num_lat_.NumStates() == num_lat_scores_.state_times.size());
+    KALDI_ASSERT(num_lat_scores_.state_times[num_lat_.Start()] == 0);
+    KALDI_ASSERT(num_lat_scores_.state_times.back() == num_frames);
+  }
+
+  KALDI_ASSERT(num_states == den_lat_scores_.state_times.size());
+  KALDI_ASSERT(den_lat_scores_.state_times[start_state] == 0);
+  KALDI_ASSERT(den_lat_scores_.state_times.back() == num_frames);
+}
+
+// Make sure that for any given pdf-id and any given frame, the den-lat has
+// only one transition-id mapping to that pdf-id, on the same frame.
+// It helps us to more completely minimize the lattice.  Note: we
+// can't do this if the criterion is MPFE, because in that case the
+// objective function will be affected by the phone-identities being
+// different even if the pdf-ids are the same.
+void DiscriminativeSupervisionSplitter::CollapseTransitionIds(
+    const std::vector<int32> &state_times, Lattice *lat) const {
+  typedef Lattice::StateId StateId;
+  typedef Lattice::Arc Arc;
+  // state_times is indexed by the state-ids of *lat (see state_times[s] below).
+  int32 num_frames = state_times.back();   // TODO: Check if this is always true
+  StateId num_states = lat->NumStates();
+  // pdf_to_tid[t] maps each pdf-id seen at time t to the first transition-id found for it.
+  std::vector<std::map<int32, int32> > pdf_to_tid(num_frames);
+  for (StateId s = 0; s < num_states; s++) {
+    int32 t = state_times[s];
+    for (fst::MutableArcIterator<Lattice> aiter(lat, s);
+         !aiter.Done(); aiter.Next()) {
+      KALDI_ASSERT(t >= 0 && t < num_frames);
+      Arc arc = aiter.Value();
+      KALDI_ASSERT(arc.ilabel != 0 && arc.ilabel == arc.olabel);
+      int32 pdf = tmodel_.TransitionIdToPdf(arc.ilabel);
+      if (pdf_to_tid[t].count(pdf) != 0) {
+        arc.ilabel = arc.olabel = pdf_to_tid[t][pdf];  // relabel to the representative transition-id
+        aiter.SetValue(arc);
+      } else {
+        pdf_to_tid[t][pdf] = arc.ilabel;  // first arc with this pdf at time t becomes the representative
+      }
+    }
+  }    
+}
+
+// Sanity-checks the per-state vectors held in this LatticeInfo:
+//  - state_times, alpha_p and beta_p must all have the same size;
+//  - state_times must be non-decreasing in state index.
+// Violations trigger KALDI_ASSERT.
+void DiscriminativeSupervisionSplitter::LatticeInfo::Check() const {
+  const size_t num_states = state_times.size();
+  KALDI_ASSERT(num_states == alpha_p.size() &&
+               num_states == beta_p.size());
+
+  // Compare every state time with its predecessor; the first state has no
+  // predecessor, so the scan starts at index 1
+  // (an empty state_times vector passes trivially).
+  for (size_t s = 1; s < num_states; s++) {
+    const int32 prev_t = state_times[s - 1];
+    const int32 cur_t = state_times[s];
+    KALDI_ASSERT(cur_t >= prev_t);
+  }
+}
+
+// Extracts the frames [begin_frame, begin_frame + num_frames) of the stored
+// supervision into *out_supervision, cutting the lattices with
+// CreateRangeLattice() and slicing the per-frame vectors to the same range.
+void DiscriminativeSupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, bool normalize, 
+                                                      DiscriminativeSupervision *out_supervision) const {
+  KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 &&
+               begin_frame + num_frames <=
+               supervision_.num_sequences * supervision_.frames_per_sequence);
+
+  // end_frame is one past the last frame covered by the output.
+  const int32 end_frame = begin_frame + num_frames;
+  DiscriminativeSupervision &out = *out_supervision;
+
+  // Lattices: the denominator always, the numerator only when present.
+  CreateRangeLattice(den_lat_, den_lat_scores_,
+                     begin_frame, end_frame, normalize, &(out.den_lat));
+  if (num_lat_present_) {
+    CreateRangeLattice(num_lat_, num_lat_scores_,
+                       begin_frame, end_frame, normalize, &(out.num_lat));
+  }
+  out.num_lat_present = num_lat_present_;
+
+  // The numerator alignment is always sliced.
+  out.num_ali.assign(supervision_.num_ali.begin() + begin_frame,
+                     supervision_.num_ali.begin() + end_frame);
+
+  // The oracle alignment and the frame weights are optional: an empty source
+  // vector yields an empty output vector.
+  out.oracle_ali.clear();
+  if (!supervision_.oracle_ali.empty()) {
+    out.oracle_ali.assign(supervision_.oracle_ali.begin() + begin_frame,
+                          supervision_.oracle_ali.begin() + end_frame);
+  }
+
+  out.weights.clear();
+  if (!supervision_.weights.empty()) {
+    out.weights.assign(supervision_.weights.begin() + begin_frame,
+                       supervision_.weights.begin() + end_frame);
+  }
+
+  // Scalar fields: the output always describes exactly one sequence.
+  out.num_sequences = 1;
+  out.weight = supervision_.weight;
+  out.frames_per_sequence = num_frames;
+
+  out.Check();
+}
+
+void DiscriminativeSupervisionSplitter::CreateRangeLattice(
+    const Lattice &in_lat, const LatticeInfo &scores,
+    int32 begin_frame, int32 end_frame, bool normalize,
+    Lattice *out_lat) const {
+  typedef Lattice::StateId StateId;
+  // Builds in *out_lat the part of in_lat covering frames [begin_frame, end_frame).
+  const std::vector<int32> &state_times = scores.state_times;
+  // state_times[s] is the time of state s of in_lat (sizes are asserted equal below).
+  // Some checks to ensure the lattice and scores are prepared properly 
+  KALDI_ASSERT(state_times.size() == in_lat.NumStates());
+  if (!in_lat.Properties(fst::kTopSorted, true))
+    KALDI_ERR << "Input lattice must be topologically sorted.";
+  // lower_bound: first state whose time is >= begin_frame, resp. >= end_frame.
+  std::vector<int32>::const_iterator begin_iter =
+      std::lower_bound(state_times.begin(), state_times.end(), begin_frame),
+      end_iter = std::lower_bound(begin_iter, 
+                                  state_times.end(), end_frame);
+  // NOTE(review): *end_iter below is only evaluated when end_iter == state_times.end(); confirm intent.
+  KALDI_ASSERT(*begin_iter == begin_frame &&
+               (begin_iter == state_times.begin() || 
+                begin_iter[-1] < begin_frame));
+  // even if end_frame == supervision_.num_frames, there should be a state with
+  // that frame index.
+  KALDI_ASSERT(end_iter[-1] < end_frame &&
+               (end_iter < state_times.end() || *end_iter == end_frame));
+  StateId begin_state = begin_iter - state_times.begin(),
+          end_state = end_iter - state_times.begin();
+
+  KALDI_ASSERT(end_state > begin_state);
+  out_lat->DeleteStates();
+  out_lat->ReserveStates(end_state - begin_state + 2);
+
+  // Add special start state
+  StateId start_state = out_lat->AddState();
+  out_lat->SetStart(start_state);
+  // One output state per input state in [begin_state, end_state).
+  for (StateId i = begin_state; i < end_state; i++)
+    out_lat->AddState();
+  
+  // Add the special final-state.
+  StateId final_state = out_lat->AddState();
+  out_lat->SetFinal(final_state, LatticeWeight::One());
+  // Input state 'state' maps to output state (state - begin_state + 1).
+  for (StateId state = begin_state; state < end_state; state++) {
+    StateId output_state = state - begin_state + 1;
+    if (state_times[state] == begin_frame) {
+      // we'd like to make this an initial state, but OpenFst doesn't allow
+      // multiple initial states.  Instead we add an epsilon transition to it
+      // from our actual initial state.  The weight on this 
+      // transition is the forward probability of the said 'initial state'
+      LatticeWeight weight = LatticeWeight::One();
+      weight.SetValue1((normalize ? scores.beta_p[0] : 0.0) - scores.alpha_p[state]); 
+      // Add negative of the forward log-probability to the graph cost score,
+      // since the acoustic scores would be changed later.
+      // Assuming that the lattice is scaled with appropriate acoustic
+      // scale.
+      // We additionally normalize using the total lattice score. Since the
+      // same score is added as normalizer to all the paths in the lattice,
+      // the relative probabilities of the paths in the lattice is not affected.
+      // Note: Doing a forward-backward on this split must result in a total
+      // score of 0 because of the normalization.
+
+      out_lat->AddArc(start_state, 
+                      LatticeArc(0, 0, weight, output_state));
+    } else {
+      KALDI_ASSERT(scores.state_times[state] < end_frame);
+    }
+    for (fst::ArcIterator<Lattice> aiter(in_lat, state); 
+          !aiter.Done(); aiter.Next()) {
+      const LatticeArc &arc = aiter.Value();
+      StateId nextstate = arc.nextstate;
+      if (nextstate >= end_state) {
+        // A transition to any state outside the range becomes a transition to
+        // our special final-state. 
+        // The weight is just the negative of the backward log-probability + 
+        // the arc cost. We again normalize with the total lattice score.
+        LatticeWeight weight;
+        //KALDI_ASSERT(scores.beta_p[state] < 0);
+        weight.SetValue1(arc.weight.Value1() - scores.beta_p[nextstate]); 
+        weight.SetValue2(arc.weight.Value2());
+        // Add negative of the backward log-probability to the LM score, since
+        // the acoustic scores would be changed later.
+        // Note: We don't normalize here because that is already done with the
+        // initial cost.
+      
+        out_lat->AddArc(output_state,
+            LatticeArc(arc.ilabel, arc.olabel, weight, final_state));
+      } else {
+        StateId output_nextstate = nextstate - begin_state + 1;
+        out_lat->AddArc(output_state,
+            LatticeArc(arc.ilabel, arc.olabel, arc.weight, output_nextstate));
+      }
+    }
+  }
+
+  // Get rid of the word labels and put the
+  // transition-ids on both sides.
+  fst::Project(out_lat, fst::PROJECT_INPUT);
+  fst::RmEpsilon(out_lat);
+  // NOTE(review): state_times is indexed by in_lat states, but out_lat is renumbered above; confirm.
+  if (config_.collapse_transition_ids)
+    CollapseTransitionIds(state_times, out_lat);
+  // With 'minimize', reverse+determinize is applied twice (then RmEpsilon) instead of one Determinize.
+  if (config_.determinize) {
+    if (!config_.minimize) {
+      Lattice tmp_lat;
+      fst::Determinize(*out_lat, &tmp_lat);
+      std::swap(*out_lat, tmp_lat);
+    } else {
+      Lattice tmp_lat;
+      fst::Reverse(*out_lat, &tmp_lat);
+      fst::Determinize(tmp_lat, out_lat);
+      fst::Reverse(*out_lat, &tmp_lat);
+      fst::Determinize(tmp_lat, out_lat);
+      fst::RmEpsilon(out_lat);
+    }
+  }
+  // The result must span exactly end_frame - begin_frame frames.
+  fst::TopSort(out_lat);    
+  std::vector<int32> state_times_tmp;
+  KALDI_ASSERT(LatticeStateTimes(*out_lat, &state_times_tmp) == end_frame - begin_frame);
+
+  // // Check if alpha scores before and after splitting are the same
+  //LatticeInfo out_scores;
+  //ComputeLatticeScores(*out_lat, &out_scores);
+  //  KALDI_ASSERT(kaldi::ApproxEqual(out_scores.alpha_p[0], scores.alpha_p[begin_state - 1], .1));
+  //  KALDI_ASSERT(kaldi::ApproxEqual(out_scores.beta_p[0], scores.beta_p[begin_state - 1] - scores.beta_p[0], .1));
+  //for (size_t n_part = 1; n_part < out_scores.alpha_p.size()-1; n_part++) {
+  //  KALDI_ASSERT(kaldi::ApproxEqual(out_scores.alpha_p[n_part], scores.alpha_p[n_part + begin_state - 1] - scores.beta_p[0], .1));
+  //  KALDI_ASSERT(kaldi::ApproxEqual(out_scores.beta_p[n_part], scores.beta_p[n_part + begin_state - 1], .1));
+  //}
+
+  // Remove the acoustic scale that was previously added
+  if (config_.supervision_config.acoustic_scale != 1.0) {
+    fst::ScaleLattice(fst::AcousticLatticeScale(1 / config_.supervision_config.acoustic_scale), out_lat);
+  }
+}
+
+void DiscriminativeSupervisionSplitter::PrepareLattice(
+    Lattice *lat, LatticeInfo *scores) const {
+  // Scale the lattice to appropriate acoustic scale. It is important to 
+  // ensure this is equal to the acoustic scale used while training. This is 
+  // because, on splitting lattices, the initial and final costs are added 
+  // into the graph cost.
+  KALDI_ASSERT(config_.supervision_config.acoustic_scale != 0.0);
+  if (config_.supervision_config.acoustic_scale != 1.0)
+    fst::ScaleLattice(fst::AcousticLatticeScale(config_.supervision_config.acoustic_scale), lat);
+  // State times of the lattice in its current numbering; used as the sort key below.
+  LatticeStateTimes(*lat, &(scores->state_times));
+  int32 num_states = lat->NumStates();
+  std::vector<int32> inv_state_order(num_states);
+  for (int32 s = 0; s < num_states; s++) {
+    inv_state_order[s] = s;
+  }
+
+  // Order the states based on the state times. This is stronger than just
+  // topological sort. This is required by the lattice splitting code.
+  std::stable_sort(inv_state_order.begin(), 
+                   inv_state_order.end(), 
+                   OtherStlVectorComparator<int32>(scores->state_times));
+  // Invert the permutation: state_order[old state] = its position after sorting.
+  std::vector<int32> state_order(num_states);
+  for (int32 s = 0; s < num_states; s++) {
+    state_order[inv_state_order[s]] = s;
+  }
+  // Renumber the states, then recompute state times and alpha/beta on the reordered lattice.
+  fst::StateSort(lat, state_order);
+  ComputeLatticeScores(*lat, scores);
+}
+
+void DiscriminativeSupervisionSplitter::ComputeLatticeScores(const Lattice &lat,
+    LatticeInfo *scores) const {
+  LatticeStateTimes(lat, &(scores->state_times));  // fills scores->state_times for 'lat'
+  ComputeLatticeAlphasAndBetas(lat, false, &(scores->alpha_p), &(scores->beta_p));  // fills alpha_p and beta_p
+  scores->Check();  // This check will fail if the states are not ordered by non-decreasing state time
+}
+
+// Appends the inputs to *output_supervision.  With 'compactify', consecutive inputs
+// sharing weight and frames_per_sequence are merged into one output element.
+void AppendSupervision(const std::vector<const DiscriminativeSupervision*> &input,
+                       bool compactify,
+                       std::vector<DiscriminativeSupervision> *output_supervision) {
+  KALDI_ASSERT(!input.empty());
+  int32 num_inputs = input.size();
+  if (num_inputs == 1) {
+    output_supervision->resize(1);
+    (*output_supervision)[0] = *(input[0]);
+    return;
+  }
+  output_supervision->clear();
+  output_supervision->reserve(input.size());
+  for (int32 i = 0; i < num_inputs; i++) {
+    const DiscriminativeSupervision &src = *(input[i]);
+    KALDI_ASSERT(src.num_sequences == 1);
+    if (compactify && !output_supervision->empty() &&
+        output_supervision->back().weight == src.weight &&
+        output_supervision->back().frames_per_sequence ==
+        src.frames_per_sequence) {
+      // Merge src into the most recent output element.
+      DiscriminativeSupervision &dst = output_supervision->back();
+      // The element being extended (not element 0) must agree with src on
+      // whether a numerator lattice is carried.
+      KALDI_ASSERT(dst.num_lat_present == src.num_lat_present);
+      AppendLattice(&dst.den_lat, src.den_lat);
+      if (src.num_lat_present)
+        AppendLattice(&dst.num_lat, src.num_lat);
+
+      dst.num_ali.insert(dst.num_ali.end(), src.num_ali.begin(), src.num_ali.end());
+      if (dst.oracle_ali.size() > 0)
+        dst.oracle_ali.insert(dst.oracle_ali.end(), src.oracle_ali.begin(), src.oracle_ali.end());
+      if (dst.weights.size() > 0)
+        dst.weights.insert(dst.weights.end(), src.weights.begin(), src.weights.end());
+      dst.num_sequences++;
+    } else {
+      output_supervision->resize(output_supervision->size() + 1);
+      output_supervision->back() = src;
+    }
+    output_supervision->back().Check();
+  }
+}
+
+void AppendLattice(Lattice *lat, const Lattice &src_lat) {
+  typedef Lattice::Arc Arc;
+  typedef Arc::StateId StateId;
+  // Concatenates src_lat onto *lat with fst::Concat and topologically sorts the result.
+  std::vector<int32> state_times;
+  int32 num_frames = LatticeStateTimes(*lat, &state_times);
+  
+  std::vector<int32> state_times_src;
+  int32 num_frames_src = LatticeStateTimes(src_lat, &state_times_src);
+  // NOTE(review): the state times / frame counts above are only used by the unreachable code below.
+  //Lattice check_lat= *lat;
+  fst::Concat(lat, src_lat);
+  fst::TopSort(lat);
+  // NOTE(review): the unconditional return below makes the rest of this function unreachable;
+  return;
+  // the remaining code looks like an earlier hand-written concatenation plus checks - confirm before removing.
+  int32 num_states_orig = lat->NumStates();
+  int32 num_states = num_states_orig;
+  
+  StateId src_start_state = src_lat.Start();
+  KALDI_ASSERT(src_start_state == 0);
+
+  lat->AddState(); num_states++;
+
+  for (StateId s = 0; s < num_states_orig; s++) {
+    LatticeWeight f = lat->Final(s);
+    if (f != LatticeWeight::Zero()) {
+      KALDI_ASSERT(state_times[s] == num_frames);
+      lat->AddArc(s, Arc(0, 0, f, num_states_orig));
+      lat->SetFinal(s, LatticeWeight::Zero());
+    }
+  }
+  
+  for (StateId s = 0; s < src_lat.NumStates(); s++) {
+    if (s != src_start_state) {
+      lat->AddState();
+      num_states++;
+    }
+    StateId state_id = num_states_orig + s;
+    KALDI_ASSERT(state_id == num_states - 1 && num_states == lat->NumStates());
+
+    for (fst::ArcIterator<Lattice> aiter(src_lat, s); 
+          !aiter.Done(); aiter.Next()) {
+      Arc arc = aiter.Value();
+      arc.nextstate += num_states_orig;
+      lat->AddArc(state_id, arc);
+    }
+    
+    LatticeWeight final_weight = src_lat.Final(s);
+    lat->SetFinal(state_id, final_weight);
+  }
+
+  KALDI_ASSERT(lat->NumStates() == num_states);
+  KALDI_ASSERT(num_states == num_states_orig + src_lat.NumStates());
+  
+
+  uint64 props = lat->Properties(fst::kTopSorted, true);
+  lat->SetProperties(props, fst::kTopSorted);
+  
+  std::vector<int32> state_times_out;
+  int32 num_frames_out = LatticeStateTimes(*lat, &state_times_out);
+  KALDI_ASSERT(num_frames_out == num_frames + num_frames_src);
+
+  for (StateId s = 0; s < lat->NumStates(); s++) {
+    LatticeWeight f = lat->Final(s);
+    if (f != LatticeWeight::Zero()) {
+      KALDI_ASSERT(state_times_out[s] == num_frames_out &&
+                   "Lattice is inconsistent (final-prob not at max_time)");
+    }
+    for (fst::ArcIterator<Lattice> aiter(*lat, s);
+        !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      if (arc.ilabel != 0)
+        KALDI_ASSERT(state_times_out[arc.nextstate] == state_times_out[s] + 1);
+    }
+  }
+}
+
+} // namespace discriminative 
+} // namespace kaldi
diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h
new file mode 100644
index 00000000000..609c5c384ee
--- /dev/null
+++ b/src/nnet3/discriminative-supervision.h
@@ -0,0 +1,267 @@
+// nnet3/discriminative-supervision.h
+
+// Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET3_DISCRIMINATIVE_SUPERVISION_H
+#define KALDI_NNET3_DISCRIMINATIVE_SUPERVISION_H
+
+#include "util/table-types.h"
+#include "hmm/posterior.h"
+#include "hmm/transition-model.h"
+#include "lat/kaldi-lattice.h"
+
+namespace kaldi {
+namespace discriminative {
+
+struct DiscriminativeSupervisionOptions {  // user-settable options; see Register()
+  int32 frame_subsampling_factor;  // --frame-subsampling-factor; defaults to 1
+  BaseFloat acoustic_scale;  // --acoustic-scale; defaults to 0.1
+
+  DiscriminativeSupervisionOptions(): frame_subsampling_factor(1), acoustic_scale(0.1) { }
+
+  void Register(OptionsItf *opts) {  // registers both fields as options on 'opts'
+    opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used "
+                   "if the frame-rate for the chain model will be less than the "
+                   "frame-rate of the original alignment.  Applied after "
+                   "left-tolerance and right-tolerance are applied (so they are "
+                   "in terms of the original num-frames).");
+    opts->Register("acoustic-scale", &acoustic_scale,
+                   "Scaling factor for acoustic likelihoods");
+  }
+
+  void Check() const;  // only declared here; the checks it performs are defined elsewhere
+};
+
+struct SplitDiscriminativeSupervisionOptions {  // every flag below defaults to false
+  bool remove_output_symbols;  // set by --remove-output-symbols
+  bool collapse_transition_ids;  // set by --collapse-transition-ids
+  bool remove_epsilons;  // set by --remove-epsilons
+  bool determinize;  // set by --determinize
+  bool minimize; // we'll push and minimize if this is true.
+  DiscriminativeSupervisionOptions supervision_config;  // nested options, registered below
+
+  SplitDiscriminativeSupervisionOptions() :
+    remove_output_symbols(false), collapse_transition_ids(false),
+    remove_epsilons(false), determinize(false),
+    minimize(false) { }
+
+  void Register(OptionsItf *opts) {  // registers the flags, then the nested options
+    opts->Register("collapse-transition-ids", &collapse_transition_ids,
+                   "Collapse transition ids");
+    opts->Register("remove-output-symbols", &remove_output_symbols,
+                   "Remove output symbols from lattice to convert it to an "
+                   "acceptor and make it more determinizable");
+    opts->Register("remove-epsilons", &remove_epsilons,
+                   "Remove epsilons");
+    opts->Register("determinize", &determinize, "If true, we determinize "
+                   "lattices (as Lattice) before splitting and possibly minimize");
+    opts->Register("minimize", &minimize, "If true, we push and "
+                   "minimize lattices (as Lattice) before splitting");
+    supervision_config.Register(opts);  // adds --frame-subsampling-factor, --acoustic-scale
+  }
+};
+
+/*
+  This file contains some declarations relating to the object we use to
+  encode the supervision information for sequence training
+*/
+
+// struct DiscriminativeSupervision is the fully-processed information for
+// a whole utterance or (after splitting) part of an utterance.
+struct DiscriminativeSupervision {
+  // The weight we assign to this example;
+  // this will typically be one, but we include it
+  // for the sake of generality.
+  BaseFloat weight;
+
+  // num_sequences will be 1 if you create a DiscriminativeSupervision object from a single
+  // lattice or alignment, but if you combine multiple DiscriminativeSupervision objects
+  // the 'num_sequences' is the number of objects that were combined (the
+  // lattices get appended).
+  int32 num_sequences;
+
+  // The number of frames in each sequence of appended objects.  frames_per_sequence *
+  // num_sequences must equal the path length of any path in the lattices.
+  // Technically this information is redundant with the lattices, but it's convenient
+  // to have it separately.  The default constructor sets it to -1.
+  int32 frames_per_sequence;
+
+  // The numerator alignment
+  std::vector<int32> num_ali;
+
+  // Alternate alignment for debugging purposes; in the case of
+  // semi-supervised training, this could hold the oracle alignment.
+  std::vector<int32> oracle_ali;
+
+  // Frame weights, usually a value between 0 and 1 to indicate the
+  // contribution of each frame to the objective function value.
+  // The default weight (when this vector is empty) is 1 for each frame.
+  std::vector<BaseFloat> weights;
+
+  // Note: any acoustic
+  // likelihoods in the lattices will be
+  // recomputed at the time we train.
+
+  // Indicates whether a numerator lattice is present (false by default).
+  bool num_lat_present;
+
+  // The numerator lattice
+  Lattice num_lat;
+
+  // The denominator lattice.
+  Lattice den_lat;
+
+  DiscriminativeSupervision(): weight(1.0), num_sequences(1),
+                               frames_per_sequence(-1),
+                               num_lat_present(false) { }
+
+  DiscriminativeSupervision(const DiscriminativeSupervision &other);
+
+  void Swap(DiscriminativeSupervision *other);
+
+  bool operator == (const DiscriminativeSupervision &other) const;
+
+  // This function checks that this supervision object satisfies some
+  // of the properties we expect of it, and calls KALDI_ERR if not.
+  void Check() const;
+
+  int32 NumFrames() const { return num_sequences * frames_per_sequence; }  // total over all sequences
+
+  void Write(std::ostream &os, bool binary) const;
+  void Read(std::istream &is, bool binary);
+};
+
+/// Creates the supervision alignment, the denominator lattice and, optionally,
+/// the oracle alignment and frame weights, as required by discriminative
+/// objective functions.  'weights' and 'oracle_alignment' default to NULL.
+bool LatticeToDiscriminativeSupervision(
+    const std::vector<int32> &alignment,
+    const Lattice &lat,
+    BaseFloat weight,
+    DiscriminativeSupervision *supervision,
+    const Vector<BaseFloat> *weights = NULL,
+    const std::vector<int32> *oracle_alignment = NULL);
+
+/// This overload is similar to the one above, but also takes a numerator
+/// lattice (num_lat) to create the discriminative example.
+bool LatticeToDiscriminativeSupervision(
+    const std::vector<int32> &alignment,
+    const Lattice &num_lat,
+    const Lattice &den_lat,
+    BaseFloat weight,
+    DiscriminativeSupervision *supervision,
+    const Vector<BaseFloat> *weights = NULL,
+    const std::vector<int32> *oracle_alignment = NULL);
+
+/// This overload is similar to the first one, but also takes a numerator
+/// posterior (num_post, together with 'dim') to create the discriminative example.
+bool LatticeToDiscriminativeSupervision(
+    const std::vector<int32> &alignment,
+    const Posterior &num_post,
+    int32 dim,
+    const Lattice &lat,
+    BaseFloat weight,
+    DiscriminativeSupervision *supervision,
+    const Vector<BaseFloat> *weights = NULL,
+    const std::vector<int32> *oracle_alignment = NULL);
+
+// This class is used for splitting something of type
+// DiscriminativeSupervision into
+// multiple pieces corresponding to different frame-ranges.
+class DiscriminativeSupervisionSplitter {
+ public:
+  typedef fst::ArcTpl<LatticeWeight> LatticeArc;
+  typedef fst::VectorFst<LatticeArc> Lattice;
+
+  DiscriminativeSupervisionSplitter(
+      const SplitDiscriminativeSupervisionOptions &config,
+      const TransitionModel &tmodel,
+      const DiscriminativeSupervision &supervision);
+
+  struct LatticeInfo {  // per-lattice quantities filled in for num_lat_ / den_lat_
+    std::vector<double> alpha_p;  // presumably forward scores per state -- TODO confirm
+    std::vector<double> beta_p;  // presumably backward scores per state -- TODO confirm
+    //std::vector<double> alpha_r;
+    //std::vector<double> beta_r;
+    std::vector<int32> state_times;
+
+    void Check() const;
+  };
+
+  // Extracts a frame range of the supervision, starting at begin_frame, into 'supervision'.
+  void GetFrameRange(int32 begin_frame, int32 frames_per_sequence,
+                     bool normalize,
+                     DiscriminativeSupervision *supervision) const;
+
+  // Get the acoustic scaled denominator lattice out for debugging purposes
+  const Lattice& DenLat() const { return den_lat_; }
+
+ private:
+
+  // Creates an output lattice covering frames begin_frame <= t < end_frame,
+  // assuming that the corresponding state-range that we need to
+  // include, begin_state <= s < end_state has been included.
+  // (note: the output lattice will also have two special initial and final
+  // states).  Does not do the post-processing (RmEpsilon, Determinize,
+  // TopSort on the result).  See code for details.
+  void CreateRangeLattice(const Lattice &in_lat,
+                          const LatticeInfo &scores,
+                          int32 begin_frame, int32 end_frame, bool normalize,
+                          Lattice *out_lat) const;
+
+  const SplitDiscriminativeSupervisionOptions &config_;  // reference member, not a copy
+  const TransitionModel &tmodel_;  // reference member, not a copy
+  const DiscriminativeSupervision &supervision_;  // reference member, not a copy
+
+  LatticeInfo num_lat_scores_;
+  LatticeInfo den_lat_scores_;
+
+  Lattice num_lat_;
+  Lattice den_lat_;  // returned by DenLat()
+  bool num_lat_present_;
+
+  void ComputeLatticeScores(const Lattice &lat, LatticeInfo *scores) const;
+  void PrepareLattice(Lattice *lat, LatticeInfo *scores) const;
+  void CollapseTransitionIds(const std::vector<int32> &state_times,
+                             Lattice *lat) const;
+
+};
+
+/// This function appends a list of supervision objects to create what will
+/// usually be a single such object; but if the weights and num-frames are not
+/// all the same, it will only append DiscriminativeSupervision objects where
+/// successive ones have the same weight and num-frames, and (presumably only)
+/// if 'compactify' is true -- TODO confirm against the implementation.  The
+/// normal use-case for this is when you are combining neural-net examples for
+/// training; appending them like this helps to simplify the decoding process.
+void AppendSupervision(const std::vector<const DiscriminativeSupervision*> &input,
+                       bool compactify,
+                       std::vector<DiscriminativeSupervision> *output_supervision);
+
+// Extends the lattice *lat in place by appending the lattice src_lat to its end.
+void AppendLattice(Lattice *lat, const Lattice &src_lat);
+
+typedef TableWriter<KaldiObjectHolder<DiscriminativeSupervision> > DiscriminativeSupervisionWriter;
+typedef SequentialTableReader<KaldiObjectHolder<DiscriminativeSupervision> > SequentialDiscriminativeSupervisionReader;
+typedef RandomAccessTableReader<KaldiObjectHolder<DiscriminativeSupervision> > RandomAccessDiscriminativeSupervisionReader;
+
+} // namespace discriminative
+} // namespace kaldi
+
+#endif // KALDI_NNET3_DISCRIMINATIVE_SUPERVISION_H
diff --git a/src/nnet3/discriminative-training-test.cc b/src/nnet3/discriminative-training-test.cc
new file mode 100644
index 00000000000..96fbefe6abd
--- /dev/null
+++ b/src/nnet3/discriminative-training-test.cc
@@ -0,0 +1,198 @@
+// nnet3/discriminative-training-test.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-discriminative-training.h"
+#include "nnet3/nnet-discriminative-diagnostics.h"
+#include "nnet3/am-nnet-simple.h"
+#include "lat/lattice-functions.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+// Checks that merging 'examples' into a single minibatch preserves the
+// denominator posteriors and the total objective for 'criterion' computed by
+// 'computer'; fails via KALDI_ASSERT otherwise.  tmodel, silence_phones and
+// one_silence_class are used only by the commented-out sMBR computations.
+void UnitTestMergeExamples(const TransitionModel &tmodel,
+                           const std::vector<int32> &silence_phones,
+                           const std::string &criterion,
+                           bool one_silence_class,
+                           std::vector<NnetDiscriminativeExample> &examples,
+                           NnetDiscriminativeComputeObjf *computer) {
+  NnetDiscriminativeExample merged_eg;
+  MergeDiscriminativeExamples(false, &examples, &merged_eg);
+
+  Posterior post_merged, post_smbr_merged;  // per-example results, concatenated
+  double objf = 0.0;  // sum of the per-example objectives
+  for (size_t i = 0; i < examples.size(); i++) {
+    Posterior post;
+    LatticeForwardBackward(examples[i].outputs[0].supervision.den_lat, &post, NULL);
+    Posterior post_smbr;
+    //LatticeForwardBackwardMpeVariants(tmodel, silence_phones, examples[i].outputs[0].supervision.den_lat, examples[i].outputs[0].supervision.num_ali, criterion, one_silence_class, &post_smbr);
+
+    post_merged.insert(post_merged.end(), post.begin(), post.end());
+    post_smbr_merged.insert(post_smbr_merged.end(), post_smbr.begin(), post_smbr.end());
+
+    computer->Compute(examples[i]);
+    objf += computer->Stats().TotalObjf(criterion);
+    computer->Reset();
+  }
+
+  Posterior merged_post;
+  LatticeForwardBackward(merged_eg.outputs[0].supervision.den_lat, &merged_post, NULL);
+  Posterior merged_post_smbr;
+  //LatticeForwardBackwardMpeVariants(tmodel, silence_phones, merged_eg.outputs[0].supervision.den_lat, merged_eg.outputs[0].supervision.num_ali, criterion, one_silence_class, &merged_post_smbr);
+
+  computer->Compute(merged_eg);
+  const discriminative::DiscriminativeTrainingStats &merged_stats = computer->Stats();
+
+  KALDI_ASSERT(merged_post_smbr.size() == post_smbr_merged.size());
+  for (size_t i = 0; i < post_smbr_merged.size(); i++) {
+    // The per-frame sizes must agree before entries are compared by index.
+    KALDI_ASSERT(post_smbr_merged[i].size() == merged_post_smbr[i].size());
+    for (size_t j = 0; j < post_smbr_merged[i].size(); j++) {
+      KALDI_ASSERT(post_smbr_merged[i][j].first == merged_post_smbr[i][j].first);
+      KALDI_ASSERT(kaldi::ApproxEqual(post_smbr_merged[i][j].second, merged_post_smbr[i][j].second));
+    }
+  }
+
+  KALDI_ASSERT(merged_post.size() == post_merged.size());
+  for (size_t i = 0; i < post_merged.size(); i++) {
+    KALDI_ASSERT(post_merged[i].size() == merged_post[i].size());
+    for (size_t j = 0; j < post_merged[i].size(); j++) {
+      KALDI_ASSERT(post_merged[i][j].first == merged_post[i][j].first);
+      if (post_merged[i][j].second < 1e-6 && merged_post[i][j].second < 1e-6)
+        continue;
+      KALDI_ASSERT(kaldi::ApproxEqual(post_merged[i][j].second, merged_post[i][j].second));
+    }
+  }
+
+  KALDI_ASSERT(ApproxEqual(merged_stats.TotalObjf(criterion), objf));
+  computer->Reset();
+}
+
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Test for merging of nnet3 discriminative training examples.  Reads\n"
+        "examples, groups them into minibatches of --minibatch-size examples and\n"
+        "checks that the denominator posteriors and objective function of each\n"
+        "merged minibatch match those of its individual examples.\n"
+        "\n"
+        "Usage:  discriminative-training-test [options] <nnet-in> <discriminative-training-examples-in> \n"
+        "\n"
+        "e.g.: discriminative-training-test 1.mdl ark:1.degs\n";
+
+    std::string use_gpu = "yes";
+    bool compress = false;  // registered as an option but not otherwise used here
+    int32 minibatch_size = 64;
+
+    NnetComputeProbOptions nnet_opts;
+    discriminative::DiscriminativeTrainingOptions discriminative_training_opts;
+
+    ParseOptions po(usage);
+    po.Register("minibatch-size", &minibatch_size, "Target size of minibatches "
+                "when merging (see also --measure-output-frames)");
+    po.Register("compress", &compress, "If true, compress the output examples "
+                "(not recommended unless you are writing to disk)");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+
+    nnet_opts.Register(&po);
+    discriminative_training_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {  // exactly <nnet-in> and <examples-in>
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string model_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2);
+
+    TransitionModel tmodel;
+    AmNnetSimple am_nnet;
+
+    bool binary;
+    Input ki(model_rxfilename, &binary);
+
+    // The model file holds the transition model followed by the acoustic model.
+    tmodel.Read(ki.Stream(), binary);
+    am_nnet.Read(ki.Stream(), binary);
+
+    NnetDiscriminativeComputeObjf discriminative_objf_computer(nnet_opts,
+                                              discriminative_training_opts,
+                                              tmodel, am_nnet.Priors(), am_nnet.GetNnet());
+
+    // Read the examples, testing each group of up to minibatch_size of them.
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+
+    int64 num_read = 0;
+
+    std::vector<int32> silence_phones;
+    SplitStringToIntegers(discriminative_training_opts.silence_phones_str, ":", false, &silence_phones);
+
+    std::vector<NnetDiscriminativeExample> examples;
+    while (!example_reader.Done()) {
+      const NnetDiscriminativeExample &cur_eg = example_reader.Value();
+      examples.resize(examples.size() + 1);
+      examples.back() = cur_eg;
+
+      bool minibatch_ready =
+          static_cast<int32>(examples.size()) >= minibatch_size;
+
+      // Do Next() now, so we can test example_reader.Done() below .
+      example_reader.Next();
+      num_read++;
+
+      // Also flush the final, possibly smaller, group at end of input.
+      if (minibatch_ready || (example_reader.Done() && !examples.empty())) {
+        UnitTestMergeExamples(tmodel, silence_phones,
+            discriminative_training_opts.criterion,
+            discriminative_training_opts.one_silence_class,
+            examples, &discriminative_objf_computer);
+        examples.clear();
+      }
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/nnet3/discriminative-training.cc b/src/nnet3/discriminative-training.cc
new file mode 100644
index 00000000000..fef2cce3dd1
--- /dev/null
+++ b/src/nnet3/discriminative-training.cc
@@ -0,0 +1,709 @@
+// nnet3/discriminative-training.cc
+
+// Copyright      2012-2015    Johns Hopkins University (author: Daniel Povey)
+// Copyright      2014-2015    Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet3/discriminative-training.h"
+#include "lat/lattice-functions.h"
+#include "cudamatrix/cu-matrix.h"
+
+namespace kaldi {
+namespace discriminative {
+
+class DiscriminativeComputation {  // holds the inputs/outputs of one Compute() call
+  typedef Lattice::Arc Arc;
+  typedef Arc::StateId StateId;
+
+ public:
+  DiscriminativeComputation(const DiscriminativeTrainingOptions &opts,
+                            const TransitionModel &tmodel,
+                            const CuVectorBase<BaseFloat> &log_priors,
+                            const DiscriminativeSupervision &supervision,
+                            const CuMatrixBase<BaseFloat> &nnet_output,
+                            DiscriminativeTrainingStats *stats,
+                            BaseFloat *l2_term,
+                            CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                            CuMatrixBase<BaseFloat> *xent_output_deriv);
+
+  void Compute();
+
+ private:
+  const DiscriminativeTrainingOptions &opts_;
+  const TransitionModel &tmodel_;
+  const CuVectorBase<BaseFloat> &log_priors_;  // may have Dim() == 0; Compute() then skips priors
+  const DiscriminativeSupervision &supervision_;
+  const CuMatrixBase<BaseFloat> &nnet_output_;
+
+  DiscriminativeTrainingStats *stats_;  // may be NULL (Compute() tests it before use)
+
+  BaseFloat *l2_term_;
+
+  CuMatrixBase<BaseFloat> *nnet_output_deriv_;  // may be NULL (Compute() tests it before use)
+  CuMatrixBase<BaseFloat> *xent_output_deriv_;  // may be NULL (Compute() tests it before use)
+
+  Lattice num_lat_;  // copy of supervision.num_lat; only filled in if num_lat_present_
+  bool num_lat_present_;
+  Lattice den_lat_;  // copy of supervision.den_lat
+
+  std::vector<int32> silence_phones_;  // parsed from opts_.silence_phones_str
+
+  double ComputeObjfAndDeriv(Posterior *post, Posterior *xent_post);
+  static inline Int32Pair MakePair(int32 first, int32 second) {  // packs two ints into an Int32Pair
+    Int32Pair ans;
+    ans.first = first;
+    ans.second = second;
+    return ans;
+  }
+};
+
+DiscriminativeComputation::DiscriminativeComputation(
+                            const DiscriminativeTrainingOptions &opts,
+                            const TransitionModel &tmodel,
+                            const CuVectorBase<BaseFloat> &log_priors,
+                            const DiscriminativeSupervision &supervision,
+                            const CuMatrixBase<BaseFloat> &nnet_output,
+                            DiscriminativeTrainingStats *stats,
+                            BaseFloat *l2_term,
+                            CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                            CuMatrixBase<BaseFloat> *xent_output_deriv)
+  : opts_(opts), tmodel_(tmodel), log_priors_(log_priors),
+  supervision_(supervision), nnet_output_(nnet_output),
+  stats_(stats),
+  l2_term_(l2_term),
+  nnet_output_deriv_(nnet_output_deriv),
+  xent_output_deriv_(xent_output_deriv),
+  num_lat_present_(supervision.num_lat_present) {
+  if (num_lat_present_) {  // the numerator lattice is copied only when present
+    num_lat_ = supervision.num_lat;
+    TopSort(&num_lat_);  // NOTE(review): return value is not checked
+  }
+
+  den_lat_ = supervision.den_lat;  // private copy; 'supervision' itself is const
+  TopSort(&den_lat_);  // NOTE(review): return value is not checked
+
+  if (!SplitStringToIntegers(opts_.silence_phones_str, ":", false,
+                             &silence_phones_)) {  // ':'-separated list of integers
+    KALDI_ERR << "Bad value for --silence-phones option: "
+              << opts_.silence_phones_str;
+  }
+}
+
+void DiscriminativeComputation::Compute() {
+  if (opts_.criterion == "mmi" && opts_.boost != 0.0) {
+    BaseFloat max_silence_error = 0.0;
+    LatticeBoost(tmodel_, supervision_.num_ali, silence_phones_,
+                 opts_.boost, max_silence_error, &den_lat_);
+  }
+
+  int32 num_frames = supervision_.frames_per_sequence * supervision_.num_sequences;
+  
+  int32 num_pdfs = nnet_output_.NumCols();
+  KALDI_ASSERT(log_priors_.Dim() == 0 || num_pdfs == log_priors_.Dim());
+  
+  // We need to look up the posteriors of some pdf-ids in the matrix
+  // "posteriors".  Rather than looking them all up using operator (), which is
+  // very slow because each lookup involves a separate CUDA call with
+  // communication over PciExpress, we look them up all at once using
+  // CuMatrix::Lookup().
+  
+  std::vector<Int32Pair> requested_indexes;
+  BaseFloat wiggle_room = 1.3; // value not critical.. it's just 'reserve'
+  
+  int32 num_reserve = wiggle_room * den_lat_.NumStates();
+  
+  if (opts_.criterion == "mmi") {
+    // For looking up the posteriors corresponding to the pdfs in the alignment
+    num_reserve += num_frames;
+  } else if (opts_.criterion == "nce" || opts_.criterion == "empfe" || 
+             opts_.criterion == "esmbr") {
+    // For looking up the posteriors corresponding to the pdfs in the 
+    // numerator lattice or the numerator posteriors
+    if (supervision_.num_lat_present) num_reserve *= 2;
+  }
+
+  requested_indexes.reserve(num_reserve);
+  
+  // Denominator probabilities to look up from denominator lattice
+  std::vector<int32> state_times;
+  int32 T = LatticeStateTimes(den_lat_, &state_times);
+  KALDI_ASSERT(T == num_frames);
+  
+  StateId num_states = den_lat_.NumStates();
+  for (StateId s = 0; s < num_states; s++) {
+    int32 t = state_times[s];
+    int32 seq = t / supervision_.frames_per_sequence, 
+          idx = t % supervision_.frames_per_sequence;
+
+    for (fst::ArcIterator<Lattice> aiter(den_lat_, s); !aiter.Done(); aiter.Next()) {
+      const Arc &arc = aiter.Value();
+      if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
+        int32 tid = arc.ilabel, pdf_id = tmodel_.TransitionIdToPdf(tid);
+        // Intuitive order: requested_indexes.push_back(MakePair(t, pdf_id));
+        requested_indexes.push_back(MakePair(idx * supervision_.num_sequences + seq, pdf_id));
+      }
+    }
+  }
+
+  if (opts_.criterion == "mmi") {
+    // Numerator probabilities to look up from alignment
+    for (int32 t = 0; t < num_frames; t++) {
+      int32 seq = t / supervision_.frames_per_sequence, 
+            idx = t % supervision_.frames_per_sequence;
+      int32 tid = supervision_.num_ali[t], pdf_id = tmodel_.TransitionIdToPdf(tid);
+      KALDI_ASSERT(pdf_id >= 0 && pdf_id < num_pdfs);
+      // Intuitive order: requested_indexes.push_back(MakePair(t, pdf_id));
+      requested_indexes.push_back(MakePair(idx * supervision_.num_sequences + seq, pdf_id));
+    }
+  } else if (opts_.criterion == "nce" || opts_.criterion == "empfe" || opts_.criterion == "esmbr") {
+    if (supervision_.num_lat_present) {
+      // Numerator probabilities to look up from numerator lattice
+      std::vector<int32> state_times;
+      int32 T = LatticeStateTimes(num_lat_, &state_times);
+      KALDI_ASSERT(T == num_frames);
+
+      StateId num_states = num_lat_.NumStates();
+      for (StateId s = 0; s < num_states; s++) {
+        int32 t = state_times[s];
+        int32 seq = t / supervision_.frames_per_sequence, 
+              idx = t % supervision_.frames_per_sequence;
+        for (fst::ArcIterator<Lattice> aiter(num_lat_, s); !aiter.Done(); aiter.Next()) {
+          const Arc &arc = aiter.Value();
+          if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
+            int32 tid = arc.ilabel, pdf_id = tmodel_.TransitionIdToPdf(tid);
+            requested_indexes.push_back(MakePair(idx * supervision_.frames_per_sequence + seq, pdf_id));
+          }
+        }
+      }
+    }
+  }
+  
+  std::vector<BaseFloat> answers;
+  CuArray<Int32Pair> cu_requested_indexes(requested_indexes);
+  answers.resize(requested_indexes.size());
+  nnet_output_.Lookup(cu_requested_indexes, &(answers[0]));
+  // requested_indexes now contain (t, j) pair and answers contains the 
+  // corresponding log p(j|x(t)) as given by the neural network
+  
+  int32 num_floored = 0;
+
+  BaseFloat floor_val = -20 * kaldi::Log(10.0); // floor for posteriors.
+  size_t index;
+
+  Vector<BaseFloat> log_priors(log_priors_);
+  
+  // Replace "answers" with the vector of scaled log-probs.  If this step takes
+  // too much time, we can look at other ways to do it, using the CUDA card.
+  for (index = 0; index < answers.size(); index++) {
+    BaseFloat log_post = answers[index];
+    if (log_post < floor_val) {
+      log_post = floor_val;
+      num_floored++;
+    }
+
+    if (log_priors_.Dim() > 0) {
+      int32 pdf_id = requested_indexes[index].second;
+      KALDI_ASSERT(log_post <= 0 && log_priors(pdf_id) <= 0);
+      BaseFloat pseudo_loglike = (log_post - log_priors(pdf_id)) * opts_.acoustic_scale;
+      KALDI_ASSERT(!KALDI_ISINF(pseudo_loglike) && !KALDI_ISNAN(pseudo_loglike));
+      answers[index] = pseudo_loglike;
+    } else {
+      answers[index] = log_post * opts_.acoustic_scale;
+    }
+  }
+  
+  if (num_floored > 0) {
+    KALDI_WARN << "Floored " << num_floored << " probabilities from nnet.";
+  }
+
+  index = 0;
+  
+  // Now put the negative (scaled) acoustic log-likelihoods in the lattice.
+  for (StateId s = 0; s < num_states; s++) {
+    for (fst::MutableArcIterator<Lattice> aiter(&den_lat_, s);
+         !aiter.Done(); aiter.Next()) {
+      Arc arc = aiter.Value();
+      if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
+        arc.weight.SetValue2(-answers[index]);
+        index++;
+        aiter.SetValue(arc);
+      }
+    }
+    LatticeWeight final = den_lat_.Final(s);
+    if (final != LatticeWeight::Zero()) {
+      final.SetValue2(0.0); // make sure no acoustic term in final-prob.
+      den_lat_.SetFinal(s, final);
+    }
+  }
+  
+  DiscriminativeTrainingStats this_stats;
+  if (stats_) 
+    this_stats.SetConfig(stats_->config);
+  
+  // Look up numerator probabilities corresponding to alignment
+  if (opts_.criterion == "mmi") {
+    double tot_num_like = 0.0;
+    KALDI_ASSERT(index + supervision_.num_ali.size() == answers.size());
+    for (size_t this_index = 0; this_index < supervision_.num_ali.size(); this_index++) {
+      tot_num_like += answers[index + this_index];
+      // Intuitive order: KALDI_ASSERT(requested_indexes[index + this_index].first == this_index && requested_indexes[index+this_index].second == tmodel_.TransitionIdToPdf(supervision_.num_ali[this_index]));
+    }
+    //KALDI_ASSERT(tot_num_like > 0); // In general, this must be positive because log_post is larger than log_prior for the correct labels
+    this_stats.tot_num_objf += supervision_.weight * tot_num_like;
+    index += supervision_.num_ali.size();
+  } else if (opts_.criterion == "nce" || opts_.criterion == "empfe" || opts_.criterion == "esmbr") {
+    if (supervision_.num_lat_present) {
+      // Now put the negative (scaled) acoustic log-likelihoods in the 
+      // numerator lattice.
+      for (StateId s = 0; s < num_lat_.NumStates(); s++) {
+        for (fst::MutableArcIterator<Lattice> aiter(&num_lat_, s);
+            !aiter.Done(); aiter.Next()) {
+          Arc arc = aiter.Value();
+          if (arc.ilabel != 0) { // input-side has transition-ids, output-side empty
+            arc.weight.SetValue2(-answers[index]);
+            index++;
+            aiter.SetValue(arc);
+          }
+        }
+        LatticeWeight final = num_lat_.Final(s);
+        if (final != LatticeWeight::Zero()) {
+          final.SetValue2(0.0); // make sure no acoustic term in final-prob.
+          num_lat_.SetFinal(s, final);
+        }
+      }
+    } 
+  }
+
+  KALDI_ASSERT(index == answers.size());
+  
+  if (nnet_output_deriv_) {
+    nnet_output_deriv_->SetZero();
+    KALDI_ASSERT(nnet_output_deriv_->NumRows() == nnet_output_.NumRows() &&
+        nnet_output_deriv_->NumCols() == nnet_output_.NumCols());
+  }
+
+  if (xent_output_deriv_) {
+    xent_output_deriv_->SetZero();
+    KALDI_ASSERT(xent_output_deriv_->NumRows() == nnet_output_.NumRows() &&
+        xent_output_deriv_->NumCols() == nnet_output_.NumCols());
+  }
+
+  Posterior post;
+  Posterior xent_post;
+  double objf = ComputeObjfAndDeriv(&post, (xent_output_deriv_ ? &xent_post : NULL));
+  
+  this_stats.tot_objf += supervision_.weight * objf;
+  
+  KALDI_ASSERT(nnet_output_.NumRows() == post.size());
+  
+  CuMatrix<BaseFloat> output_deriv;
+  
+  CuMatrixBase<BaseFloat> *output_deriv_temp; 
+  
+  if (nnet_output_deriv_) 
+    output_deriv_temp = nnet_output_deriv_;
+  else {
+    output_deriv.Resize(nnet_output_.NumRows(), nnet_output_.NumCols());
+    output_deriv_temp = &output_deriv;
+  }
+  
+  double tot_num_post = 0.0, tot_post = 0.0, tot_den_post = 0.0;
+  {
+    std::vector<Int32Pair> deriv_indexes;
+    std::vector<BaseFloat> deriv_data;
+    for (size_t t = 0; t < post.size(); t++) {
+      for (size_t j = 0; j < post[t].size(); j++) {
+        int32 seq = t / supervision_.frames_per_sequence, 
+              idx = t % supervision_.frames_per_sequence;
+        int32 pdf_id = post[t][j].first;
+        deriv_indexes.push_back(MakePair(idx * supervision_.num_sequences + seq, pdf_id));
+        BaseFloat weight = post[t][j].second;
+        if (weight > 0.0) tot_num_post += weight;
+        if (weight < 0.0) tot_den_post -= weight;
+        deriv_data.push_back(weight);
+      }
+    }
+    CuArray<Int32Pair> cu_deriv_indexes(deriv_indexes);
+    output_deriv_temp->AddElements(supervision_.weight, cu_deriv_indexes, deriv_data.data());
+  }
+
+  if (xent_output_deriv_) {
+    std::vector<Int32Pair> deriv_indexes;
+    std::vector<BaseFloat> deriv_data;
+    for (size_t t = 0; t < xent_post.size(); t++) {
+      for (size_t j = 0; j < xent_post[t].size(); j++) {
+        int32 seq = t / supervision_.frames_per_sequence, 
+              idx = t % supervision_.frames_per_sequence;
+        int32 pdf_id = xent_post[t][j].first;
+        deriv_indexes.push_back(MakePair(idx * supervision_.num_sequences + seq, pdf_id));
+        deriv_data.push_back(xent_post[t][j].second);
+      }
+    }
+    CuArray<Int32Pair> cu_deriv_indexes(deriv_indexes);
+    xent_output_deriv_->AddElements(supervision_.weight, cu_deriv_indexes, deriv_data.data());
+  }
+
+  if (opts_.criterion == "nce") {
+    tot_post = tot_num_post + tot_den_post;
+    tot_num_post = tot_den_post = 0.0;
+  }
+
+  this_stats.tot_gradients += tot_post;
+  this_stats.tot_den_count += tot_den_post;
+  this_stats.tot_num_count += tot_num_post;
+
+  if (this_stats.AccumulateGradients()) 
+    (this_stats.gradients).AddRowSumMat(1.0, CuMatrix<double>(*output_deriv_temp));
+  if (this_stats.AccumulateOutput()) {
+    CuMatrix<double> temp(nnet_output_);
+    temp.ApplyExp();
+    (this_stats.output).AddRowSumMat(1.0, temp);
+  }
+  
+  this_stats.tot_t = T;
+  this_stats.tot_t_weighted = T * supervision_.weight;
+  
+  if (!(this_stats.TotalObjf(opts_.criterion) == this_stats.TotalObjf(opts_.criterion))) {
+    // inf or NaN detected
+    if (nnet_output_deriv_)
+      nnet_output_deriv_->SetZero();
+    BaseFloat default_objf = -10;
+    KALDI_WARN << "Objective function is " << this_stats.TotalObjf(opts_.criterion)
+               << ", setting to " << default_objf << " per frame.";
+    this_stats.tot_objf = default_objf * this_stats.tot_t_weighted;
+  }
+  
+  if (GetVerboseLevel() >= 2) {
+    if (GetVerboseLevel() >= 3) {
+      this_stats.Print(opts_.criterion, true, true, true);
+    } else 
+      this_stats.Print(opts_.criterion);
+  }
+
+  if (stats_)
+    stats_->Add(this_stats);
+
+  // This code helps us see how big the derivatives are, on average,
+  // for different frames of the sequences.  As expected, they are
+  // smaller towards the edges of the sequences (due to the penalization
+  // of 'incorrect' pdf-ids).
+  if (nnet_output_deriv_ && GetVerboseLevel() >= 1) {
+    int32 tot_frames = nnet_output_deriv_->NumRows(),
+ frames_per_sequence = supervision_.frames_per_sequence,
+       num_sequences = supervision_.num_sequences;
+    CuVector<BaseFloat> row_products(tot_frames);
+    row_products.AddDiagMat2(1.0, *nnet_output_deriv_, kNoTrans, 0.0);
+    Vector<BaseFloat> row_products_cpu(row_products);
+    Vector<BaseFloat> row_products_per_frame(frames_per_sequence);
+    for (int32 i = 0; i < tot_frames; i++)
+      row_products_per_frame(i / num_sequences) += row_products_cpu(i);
+    KALDI_LOG << "Derivs per frame are " << row_products_per_frame;
+  }
+  
+  if (opts_.l2_regularize != 0.0) {
+    // compute the l2 penalty term and its derivative
+    BaseFloat scale = supervision_.weight * opts_.l2_regularize;
+    *l2_term_ += -0.5 * scale * TraceMatMat(nnet_output_, nnet_output_, kTrans);
+    if (nnet_output_deriv_)
+      nnet_output_deriv_->AddMat(-1.0 * scale, nnet_output_);
+  }
+}
+
+// Computes the objective function for opts_.criterion and writes the
+// corresponding pdf-level posteriors to 'post' (the caller adds these to the
+// nnet-output derivative).  If 'xent_post' is non-NULL it is also filled, with
+// pdf-level posteriors for the cross-entropy output.  Returns the value given
+// by the criterion's forward-backward routine; an unrecognized criterion is
+// reported through KALDI_ERR.
+double DiscriminativeComputation::ComputeObjfAndDeriv(Posterior *post, Posterior *xent_post) {
+  // xent posteriors: numerator alignment (mpfe/smbr/mmi), else lattice forward-backward.
+  if (xent_post) {
+    Posterior tid_post;
+    if (opts_.criterion == "mpfe" || opts_.criterion == "smbr" || opts_.criterion == "mmi") {
+      AlignmentToPosterior(supervision_.num_ali,
+          &tid_post);
+    } else {
+      if (supervision_.num_lat_present) {
+        LatticeForwardBackward(supervision_.num_lat,
+          &tid_post);
+      } else {
+        LatticeForwardBackward(supervision_.den_lat,
+          &tid_post);
+      }
+    }
+    ConvertPosteriorToPdfs(tmodel_, tid_post, xent_post);
+  }
+
+  if (opts_.criterion == "mpfe" || opts_.criterion == "smbr") {
+    Posterior tid_post;
+    double ans = LatticeForwardBackwardMpeVariants(tmodel_, silence_phones_, den_lat_,
+        supervision_.num_ali, opts_.criterion,
+        opts_.one_silence_class,
+        &tid_post);
+    ConvertPosteriorToPdfs(tmodel_, tid_post, post);
+    return ans;
+  } else if (opts_.criterion == "mmi") {
+    bool convert_to_pdfs = true, cancel = true;
+    // we'll return the denominator-lattice forward backward likelihood,
+    // which is one term in the objective function.
+    return (LatticeForwardBackwardMmi(tmodel_, den_lat_, supervision_.num_ali,
+                                      opts_.drop_frames, convert_to_pdfs,
+                                      cancel, post));
+  } else if (opts_.criterion == "nce") {
+    Posterior tid_post;
+    SignedLogDouble obj_func;
+    if (supervision_.weights.size() > 0)
+      obj_func = LatticeForwardBackwardNce(tmodel_, den_lat_, &tid_post, &supervision_.weights, opts_.weight_threshold);
+    else
+      obj_func = LatticeForwardBackwardNce(tmodel_, den_lat_, &tid_post);
+
+    ConvertPosteriorToPdfs(tmodel_, tid_post, post);
+    return obj_func.Value(); // returns the objective function.
+  } else if (opts_.criterion == "empfe" || opts_.criterion == "esmbr") {
+    double obj_func;
+    Posterior tid_post;
+    // The debug block runs on both passes of this loop; the objective is computed only on pass 0.
+    for (int32 debug_run = 0; debug_run <= 1; debug_run++) {
+      if (opts_.debug_training) {
+        KALDI_ASSERT(supervision_.oracle_ali.size() > 0);
+        Posterior oracle_post;
+        std::vector<std::string> debug_criteria;
+        if (opts_.debug_training_advanced) {
+          debug_criteria.resize(2);
+          debug_criteria[0] = "smbr";
+          debug_criteria[1] = "mpfe";
+          debug_criteria.push_back("empfe");
+          debug_criteria.push_back("esmbr");
+        }
+        double pdf_accuracy = 0.0, weighted_pdf_accuracy = 0.0;
+        double phone_accuracy = 0.0, weighted_phone_accuracy = 0.0;
+
+        for (size_t i = 0; i < supervision_.NumFrames(); i++) {
+          phone_accuracy += ( tmodel_.TransitionIdToPhone(supervision_.num_ali[i]) == tmodel_.TransitionIdToPhone(supervision_.oracle_ali[i]) );
+          pdf_accuracy += ( tmodel_.TransitionIdToPdf(supervision_.num_ali[i]) == tmodel_.TransitionIdToPdf(supervision_.oracle_ali[i]) );
+          // Weighted variants: weights[i] is credited on a match, (1 - weights[i]) on a mismatch.
+          weighted_phone_accuracy += supervision_.weights[i] * ( tmodel_.TransitionIdToPhone(supervision_.num_ali[i]) == tmodel_.TransitionIdToPhone(supervision_.oracle_ali[i]) ) + (1 - supervision_.weights[i]) * ( tmodel_.TransitionIdToPhone(supervision_.num_ali[i]) != tmodel_.TransitionIdToPhone(supervision_.oracle_ali[i]) );
+          weighted_pdf_accuracy += supervision_.weights[i] * ( tmodel_.TransitionIdToPdf(supervision_.num_ali[i]) == tmodel_.TransitionIdToPdf(supervision_.oracle_ali[i]) ) + (1 - supervision_.weights[i]) * ( tmodel_.TransitionIdToPdf(supervision_.num_ali[i]) != tmodel_.TransitionIdToPdf(supervision_.oracle_ali[i]) );
+        }
+
+        double expected_pdf_accuracy = 
+          LatticeForwardBackwardEmpeVariants(tmodel_, 
+              silence_phones_, num_lat_, supervision_.oracle_ali, NULL, NULL,
+              "smbr", false, 0, 
+              &oracle_post, 0.0);
+        double expected_phone_accuracy = 
+          LatticeForwardBackwardEmpeVariants(tmodel_, 
+              silence_phones_, num_lat_, supervision_.oracle_ali, NULL, NULL,
+              "mpfe", false, 0, 
+              &oracle_post, 0.0);
+
+        for (std::vector<std::string>::const_iterator it = debug_criteria.begin();
+            it != debug_criteria.end(); ++it) {
+          double obj_func_best_path = 
+            LatticeForwardBackwardEmpeVariants(tmodel_, 
+                silence_phones_, den_lat_, supervision_.num_ali, NULL, &num_lat_,
+                *it, false, 0, 
+                &oracle_post, 0.0);
+          double obj_func_best_path_weighted = 
+            LatticeForwardBackwardEmpeVariants(tmodel_, 
+                silence_phones_, den_lat_, supervision_.num_ali, NULL, &num_lat_,
+                *it, false, 0, 
+                &oracle_post, 0.0, &supervision_.weights);
+          double obj_func_oracle = 
+            LatticeForwardBackwardEmpeVariants(tmodel_, 
+                silence_phones_, den_lat_, supervision_.oracle_ali, NULL, &num_lat_,
+                *it, false, 0, 
+                &oracle_post, 0.0);
+
+          KALDI_LOG << "self-training " << *it 
+            << ": " << obj_func_best_path / supervision_.NumFrames()
+            << "; weighted self-training " << *it 
+            << ": " << obj_func_best_path_weighted / supervision_.NumFrames();
+          if (*it == "smbr" || *it == "mpfe")
+            KALDI_LOG << "oracle " << *it 
+              << ": " << obj_func_oracle / supervision_.NumFrames();
+        }
+        KALDI_LOG << "pdf accuracy : " << pdf_accuracy / supervision_.NumFrames()
+          << "; phone accuracy : " << phone_accuracy / supervision_.NumFrames();
+        KALDI_LOG << "weighted pdf accuracy : " << weighted_pdf_accuracy / supervision_.NumFrames()
+          << "; weighted phone accuracy : " << weighted_phone_accuracy / supervision_.NumFrames();
+        KALDI_LOG << "expected pdf accuracy : " << expected_pdf_accuracy / supervision_.NumFrames()
+          << "; expected phone accuracy : " << expected_phone_accuracy / supervision_.NumFrames();
+      }
+
+      if (debug_run == 0) {
+        /*if (!supervision_.num_lat_present && supervision_.num_post.size() > 0) {
+          // Using numerator posteriors
+          obj_func = LatticeForwardBackwardEmpeVariants(tmodel_, 
+                silence_phones_, den_lat_, supervision_.num_ali, &supervision_.num_post, NULL,
+                opts_.criterion, opts_.one_silence_class, opts_.deletion_penalty, 
+                &tid_post, opts_.weight_threshold);
+        } else */ if (supervision_.num_lat_present) {
+          // Using numerator lattice
+          obj_func = LatticeForwardBackwardEmpeVariants(tmodel_, 
+                silence_phones_, den_lat_, supervision_.num_ali, NULL, &num_lat_, 
+                opts_.criterion, opts_.one_silence_class, opts_.deletion_penalty, 
+                &tid_post, opts_.weight_threshold);
+        } else {
+          // Using denominator lattice
+          obj_func = LatticeForwardBackwardEmpeVariants(tmodel_, 
+                silence_phones_, den_lat_, supervision_.num_ali, NULL, NULL,
+                opts_.criterion, opts_.one_silence_class, opts_.deletion_penalty, 
+                &tid_post, opts_.weight_threshold);
+        }
+        ConvertPosteriorToPdfs(tmodel_, tid_post, post);
+        KALDI_ASSERT(post->size() == supervision_.NumFrames());
+        if (opts_.debug_training)
+          KALDI_LOG << opts_.criterion << ": " << obj_func / supervision_.NumFrames();
+      }
+    }
+    return obj_func;
+  } else {
+    KALDI_ERR << "Unknown criterion " << opts_.criterion;
+  }
+  return 0;
+}
+
+
+void ComputeDiscriminativeObjfAndDeriv(
+    const DiscriminativeTrainingOptions &opts, const TransitionModel &tmodel,
+    const CuVectorBase<BaseFloat> &log_priors,
+    const DiscriminativeSupervision &supervision,
+    const CuMatrixBase<BaseFloat> &nnet_output,
+    DiscriminativeTrainingStats *stats, BaseFloat *l2_term,
+    CuMatrixBase<BaseFloat> *nnet_output_deriv,
+    CuMatrixBase<BaseFloat> *xent_output_deriv) {
+  DiscriminativeComputation dc(opts, tmodel, log_priors, supervision, nnet_output,
+                               stats, l2_term, nnet_output_deriv, xent_output_deriv);
+  dc.Compute();  // all of the work happens inside the computation object
+}
+
+// Folds the totals of 'other' into this object; the per-pdf vectors are only
+// added when the corresponding accumulation is enabled on this object.
+void DiscriminativeTrainingStats::Add(const DiscriminativeTrainingStats &other) {
+  tot_t += other.tot_t;
+  tot_t_weighted += other.tot_t_weighted;
+  // For mmi, tot_objf holds the denominator part and tot_num_objf the numerator.
+  tot_objf += other.tot_objf;
+  tot_num_objf += other.tot_num_objf;
+  tot_gradients += other.tot_gradients;  // only for nce
+  tot_num_count += other.tot_num_count;  // not for nce
+  tot_den_count += other.tot_den_count;  // not for nce
+  if (AccumulateGradients())
+    gradients.AddVec(1.0, other.gradients);
+  if (AccumulateOutput())
+    output.AddVec(1.0, other.output);
+}
+
+// Logs per-frame averages of the accumulated statistics for 'criterion'; for
+// an unrecognized criterion only the optional per-pdf vectors are handled.
+void DiscriminativeTrainingStats::Print(const std::string &criterion, 
+                                    bool print_avg_gradients, 
+                                    bool print_avg_output,
+                                    bool print_avg_counts) const {  // NOTE: print_avg_counts is not read below.
+  if (criterion == "mmi") {
+    double num_objf = tot_num_objf / tot_t_weighted,
+           den_objf = tot_objf / tot_t_weighted;
+    double objf = num_objf - den_objf;
+    double avg_post_per_frame = tot_num_count / tot_t_weighted;
+    KALDI_LOG << "Number of frames is " << tot_t
+              << " (weighted: " << tot_t_weighted
+              << "), average (num or den) posterior per frame is "
+              << avg_post_per_frame;
+    KALDI_LOG << "MMI objective function is " << num_objf << " - "
+              << den_objf << " = " << objf << " per frame, over "
+              << tot_t_weighted << " frames.";
+  } else if (criterion == "mpfe") {
+    double avg_gradients = (tot_num_count + tot_den_count) / tot_t_weighted;
+    double objf = tot_objf / tot_t_weighted;
+    KALDI_LOG << "Average modulus of MPFE gradients is " << avg_gradients 
+              << " per frame, over "
+              << tot_t_weighted << " frames";
+    KALDI_LOG << "MPFE objective function is " << objf
+              << " per frame, over " << tot_t_weighted << " frames.";
+  } else if (criterion == "smbr") {
+    double avg_gradients = (tot_num_count + tot_den_count) / tot_t_weighted;
+    double objf = tot_objf / tot_t_weighted;
+    KALDI_LOG << "Average modulus of SMBR gradients is " << avg_gradients 
+              << " per frame, over "
+              << tot_t_weighted << " frames";
+    KALDI_LOG << "SMBR objective function is " << objf
+              << " per frame, over " << tot_t_weighted << " frames.";
+  } else if (criterion == "nce") {
+    double avg_gradients = (tot_gradients) / tot_t_weighted;
+    double objf = tot_objf / tot_t_weighted;
+    KALDI_LOG << "Average modulus of NCE gradients is " << avg_gradients 
+              << " per frame, over "
+              << tot_t_weighted << " frames";
+    KALDI_LOG << "NCE objective function is " << objf << " per frame, over "
+              << tot_t_weighted << " frames";
+  } else if (criterion == "esmbr") {
+    double avg_gradients = (tot_num_count + tot_den_count) / tot_t_weighted;
+    double objf = tot_objf / tot_t_weighted;
+    KALDI_LOG << "Average modulus of ESMBR gradients is " << avg_gradients 
+              << " per frame, over "
+              << tot_t_weighted << " frames";
+    KALDI_LOG << "ESMBR objective function is " << objf << " per frame, over "
+              << tot_t_weighted << " frames";
+  } else if (criterion == "empfe") {
+    double avg_gradients = (tot_num_count + tot_den_count) / tot_t_weighted;
+    double objf = tot_objf / tot_t_weighted;
+    KALDI_LOG << "Average modulus of EMPFE gradients is " << avg_gradients 
+              << " per frame, over "
+              << tot_t_weighted << " frames";
+    KALDI_LOG << "EMPFE objective function is " << objf << " per frame, over "
+              << tot_t_weighted << " frames";
+  }
+  // Per-pdf averages go to KALDI_LOG when requested, otherwise to KALDI_VLOG(4).
+  if (AccumulateGradients()) {
+    {
+      Vector<double> temp(gradients);
+      temp.Scale(1.0/tot_t_weighted);
+      if (print_avg_gradients) {
+        KALDI_LOG << "Vector of average gradients wrt output activations is: \n" << temp;
+      } else {
+        KALDI_VLOG(4) << "Vector of average gradients wrt output activations is: \n" << temp;
+      }
+    }
+  }
+  if (AccumulateOutput()) {
+    {
+      Vector<double> temp(output);
+      temp.Scale(1.0/tot_t_weighted);
+      if (print_avg_output) {
+        KALDI_LOG << "Average DNN posterior is: \n" << temp;
+      } else {
+        KALDI_VLOG(4) << "Average DNN posterior is: \n" << temp;
+      }
+    }
+  }
+}
+
+// Logs the per-frame average gradient for 'pdf_id'; out-of-range ids are ignored.
+void DiscriminativeTrainingStats::PrintAvgGradientForPdf(int32 pdf_id) const {
+  if (pdf_id >= 0 && pdf_id < gradients.Dim()) {  // '&&', not 'and': MSVC portability
+    KALDI_LOG << "Average gradient wrt output activations of pdf " << pdf_id 
+      << " is " << gradients(pdf_id) / tot_t_weighted
+      << " per frame, over "
+      << tot_t_weighted << " frames";
+  } 
+}
+
+
+}  // namespace discriminative
+}  // namespace kaldi
+
diff --git a/src/nnet3/discriminative-training.h b/src/nnet3/discriminative-training.h
new file mode 100644
index 00000000000..3148c5fc22d
--- /dev/null
+++ b/src/nnet3/discriminative-training.h
@@ -0,0 +1,257 @@
+// nnet3/discriminative-training.h
+
+// Copyright      2012-2015    Johns Hopkins University (author: Daniel Povey)
+// Copyright      2014-2015    Vimal Manohar
+
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef KALDI_NNET3_DISCRIMINATIVE_TRAINING_H_
+#define KALDI_NNET3_DISCRIMINATIVE_TRAINING_H_
+
+#include <vector>
+#include <map>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "tree/context-dep.h"
+#include "lat/kaldi-lattice.h"
+#include "matrix/kaldi-matrix.h"
+#include "hmm/transition-model.h"
+#include "nnet3/discriminative-supervision.h"
+#include "lat/lattice-functions.h"
+#include "cudamatrix/cu-matrix-lib.h"
+
+namespace kaldi {
+namespace discriminative {
+
+// Options for discriminative training: the criterion and its parameters,
+// regularization constants and debugging switches.
+struct DiscriminativeTrainingOptions {
+  std::string criterion; // "mmi" or "mpfe" or "smbr" or "nce" or "empfe" or "esmbr"
+                         // If the criterion does not match the supervision
+                         // object, the derivatives may not be very accurate
+  BaseFloat acoustic_scale; // e.g. 0.1
+  bool drop_frames; // for MMI, true if we ignore frames where alignment
+                    // pdf-id is not in the lattice.
+  bool one_silence_class;  // Affects MPFE/SMBR/EMPFE/ESMBR
+  BaseFloat deletion_penalty;     // e.g. 0.1. Affects ESMBR and EMPFE.
+  BaseFloat boost; // for MMI, boosting factor (would be Boosted MMI)... e.g. 0.1.
+  std::string silence_phones_str; // colon-separated list of integer ids of silence phones,
+                                  // for MPFE/SMBR/EMPFE/ESMBR only.
+  BaseFloat weight_threshold; // e.g. 0.0; frames below this confidence are ignored
+  BaseFloat xent_regularize;  // cross-entropy regularization constant (0.0 by default)
+  BaseFloat l2_regularize;    // l2 regularization constant applied to the nnet output
+  bool debug_training;           // debug training using the oracle alignment
+  bool debug_training_advanced;  // as above, with all objective functions
+
+  DiscriminativeTrainingOptions(): criterion("smbr"), acoustic_scale(0.1),
+                                   drop_frames(false),
+                                   one_silence_class(false),
+                                   deletion_penalty(0.0),
+                                   boost(0.0), weight_threshold(0.0),
+                                   xent_regularize(0.0), l2_regularize(0.0),
+                                   debug_training(false), 
+                                   debug_training_advanced(false) { }
+
+  void Register(OptionsItf *opts) {
+    opts->Register("criterion", &criterion, "Criterion, 'mmi'|'mpfe'|'smbr'|'nce'|'empfe'|'esmbr', "
+                   "determines the objective function to use.  Should match "
+                   "option used when we created the examples.");
+    opts->Register("acoustic-scale", &acoustic_scale, "Weighting factor to "
+                   "apply to acoustic likelihoods.");
+    opts->Register("drop-frames", &drop_frames, "For MMI, if true we drop frames "
+                   "with no overlap of num and den frames");
+    opts->Register("boost", &boost, "Boosting factor for boosted MMI (e.g. 0.1)");
+    opts->Register("one-silence-class", &one_silence_class, "If true, newer "
+                   "behavior which will tend to reduce insertions.");
+    opts->Register("deletion-penalty", &deletion_penalty, "Penalize deletions "
+                   "by favoring paths that don't have deletions.");
+    opts->Register("silence-phones", &silence_phones_str,
+                   "For MPFE or SMBR, colon-separated list of integer ids of "
+                   "silence phones, e.g. 1:2:3");
+    opts->Register("weight-threshold", &weight_threshold, 
+                   "Ignore frames below a confidence threshold");
+    opts->Register("l2-regularize", &l2_regularize, "l2 regularization "
+                   "constant for 'chain' training, applied to the output "
+                   "of the neural net.");
+    opts->Register("xent-regularize", &xent_regularize, "Cross-entropy "
+                   "regularization constant for 'chain' training.  If "
+                   "nonzero, the network is expected to have an output "
+                   "named 'output-xent', which should have a softmax as "
+                   "its final nonlinearity.");
+    opts->Register("debug-training", &debug_training,
+                   "Debug training using oracle alignment. Gives error "
+                   "if oracle alignment is not found in examples");
+    opts->Register("debug-training-advanced", &debug_training_advanced,
+                   "Debug training using oracle alignment with all objective functions. "
+                   "Gives error if oracle alignment is not found in examples");
+  }
+};
+
+// Switches selecting which extra debugging statistics are accumulated, plus
+// the number of pdfs used to size them.
+struct DiscriminativeTrainingStatsOptions {
+  bool accumulate_gradients;
+  bool accumulate_output;
+  bool accumulate_counts;
+  int32 num_pdfs;
+
+  DiscriminativeTrainingStatsOptions():
+      accumulate_gradients(false), accumulate_output(false),
+      accumulate_counts(false), num_pdfs(0) { }
+
+  void Register(OptionsItf *opts) {
+    opts->Register("accumulate-gradients", &accumulate_gradients,
+                   "Accumulate gradients for debugging discriminative training");
+    opts->Register("accumulate-counts", &accumulate_counts,
+                   "Accumulate indicator counts of denominator pdfs "
+                   "for debugging discriminative training");
+    opts->Register("accumulate-output", &accumulate_output,
+                   "Accumulate nnet output for debugging discriminative training");
+    opts->Register("num-pdfs", &num_pdfs, "Number of pdfs");
+  }
+};
+
+// Totals accumulated over examples; every constructor zeroes all tot_* members.
+struct DiscriminativeTrainingStats {
+  double tot_t;         // total number of frames
+  double tot_t_weighted; // total number of frames times weight.
+  double tot_objf;      // for MMI, the (weighted) denominator likelihood; for
+                        // everything else, the objective function.
+  double tot_gradients; // for NCE, the gradients, for everything else 0
+  double tot_num_count; // total count of numerator posterior for everything but NCE
+  double tot_den_count; // total count of denominator posterior for everything but NCE
+  double tot_num_objf;  // for MMI, the (weighted) numerator likelihood; for
+                        // everything else 0
+
+  DiscriminativeTrainingStatsOptions config;
+
+  CuVector<double> gradients;
+  CuVector<double> output;
+
+  DiscriminativeTrainingStats(int32 num_pdfs):
+      tot_t(0.0), tot_t_weighted(0.0), tot_objf(0.0), tot_gradients(0.0),
+      tot_num_count(0.0), tot_den_count(0.0), tot_num_objf(0.0) {
+    config.num_pdfs = num_pdfs;  // accumulate_* flags already default to false
+    gradients.Resize(num_pdfs);
+    output.Resize(num_pdfs);
+  }
+
+  // Members are initialized explicitly rather than via memset(), which is
+  // undefined for a struct holding non-trivial members such as CuVector.
+  DiscriminativeTrainingStats():
+      tot_t(0.0), tot_t_weighted(0.0), tot_objf(0.0), tot_gradients(0.0),
+      tot_num_count(0.0), tot_den_count(0.0), tot_num_objf(0.0) { }
+
+  DiscriminativeTrainingStats(DiscriminativeTrainingStatsOptions opts):
+      tot_t(0.0), tot_t_weighted(0.0), tot_objf(0.0), tot_gradients(0.0),
+      tot_num_count(0.0), tot_den_count(0.0), tot_num_objf(0.0),
+      config(opts) {
+    gradients.Resize(opts.num_pdfs);
+    output.Resize(opts.num_pdfs);
+  }
+
+  void Reset() {
+    gradients.SetZero();
+    output.SetZero();
+
+    tot_t = 0.0;
+    tot_t_weighted = 0.0;
+    tot_objf = 0.0;
+    tot_gradients = 0.0;
+    tot_num_count = 0.0;
+    tot_den_count = 0.0;
+    tot_num_objf = 0.0;
+  }
+
+  void SetConfig(const DiscriminativeTrainingStatsOptions &opts) {
+    config = opts;
+    gradients.Resize(opts.num_pdfs);
+    output.Resize(opts.num_pdfs);
+  }
+
+  void Print(const std::string &criterion,
+             bool print_avg_gradients = false,
+             bool print_avg_output = false,
+             bool print_avg_counts = false) const;
+
+  void PrintAll(const std::string &criterion) const {
+    Print(criterion, true, true, true);
+  }
+
+  void PrintAvgGradientForPdf(int32 pdf_id) const;
+  void Add(const DiscriminativeTrainingStats &other);
+
+  // Total objective; for "mmi" this is the numerator minus the denominator part.
+  inline double TotalObjf(const std::string &criterion) const {
+    if (criterion == "mmi") return (tot_num_objf - tot_objf);
+    return tot_objf;
+  }
+
+  inline double TotalT() const { return tot_t_weighted; }
+
+  inline bool AccumulateGradients() const {
+    return config.accumulate_gradients && gradients.Dim() > 0;
+  }
+
+  inline bool AccumulateOutput() const {
+    return config.accumulate_output && output.Dim() > 0;
+  }
+};
+
+/**
+   This function does forward-backward on the numerator and denominator
+   lattices and computes derivatives w.r.t. the output for the objective
+   function specified in opts.criterion.
+
+   @param [in] opts        Struct containing options
+   @param [in] tmodel, log_priors  Passed on to DiscriminativeComputation.
+   @param [in] supervision  The supervision object, containing the numerator
+                            and denominator paths. The denominator is 
+                            always a lattice. The numerator can either be 
+                            a lattice or an alignment.
+   @param [in] nnet_output  The output of the neural net; dimension must equal
+                          ((supervision.num_sequences * supervision.frames_per_sequence) by
+                            the number of pdfs).
+   @param [in,out] stats   If non-NULL, this example's statistics are added to it.
+   @param [in,out] l2_term  Incremented by the l2 penalty term when
+                           opts.l2_regularize is nonzero.
+   @param [out] nnet_output_deriv  The derivative of the objective function w.r.t.
+                           the neural-net output.  Only written to if non-NULL.
+                           You don't have to zero this before passing to this function,
+                           we zero it internally.
+   @param [out] xent_output_deriv  If non-NULL, zeroed internally and then filled
+                           from the cross-entropy posteriors.
+*/
+void ComputeDiscriminativeObjfAndDeriv(const DiscriminativeTrainingOptions &opts,
+                                       const TransitionModel &tmodel,
+                                       const CuVectorBase<BaseFloat> &log_priors,
+                                       const DiscriminativeSupervision &supervision,
+                                       const CuMatrixBase<BaseFloat> &nnet_output,
+                                       DiscriminativeTrainingStats *stats,
+                                       BaseFloat *l2_term,
+                                       CuMatrixBase<BaseFloat> *nnet_output_deriv,
+                                       CuMatrixBase<BaseFloat> *xent_output_deriv);
+
+}  // namespace discriminative
+}  // namespace kaldi
+
+#endif  // KALDI_NNET3_DISCRIMINATIVE_TRAINING_H_
+
+
diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc
index a2c6d07105b..9f4136a26b1 100644
--- a/src/nnet3/natural-gradient-online-test.cc
+++ b/src/nnet3/natural-gradient-online-test.cc
@@ -307,7 +307,7 @@ void UnitTestPreconditionDirectionsOnline() {
     AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02);
 
     AssertEqual(Mcopy1, Mcopy2);
-    AssertEqual(row_prod1, row_prod2, 1.0e-02f);
+    AssertEqual<BaseFloat>(row_prod1, row_prod2, 1.0e-02);
     AssertEqual(gamma1, gamma2, 1.0e-02);
 
     // make sure positive definite
diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc
index 3b351256090..5a3de5c6074 100644
--- a/src/nnet3/nnet-am-decodable-simple.cc
+++ b/src/nnet3/nnet-am-decodable-simple.cc
@@ -112,9 +112,9 @@ void NnetDecodableBase::EnsureFrameIsComputed(int32 subsampled_frame) {
   int32 subsampling_factor = opts_.frame_subsampling_factor,
       subsampled_frames_per_chunk = opts_.frames_per_chunk / subsampling_factor,
       start_subsampled_frame = subsampled_frame,
-     num_subsampled_frames = std::min<int32>(num_subsampled_frames_ -
-                                             start_subsampled_frame,
-                                             subsampled_frames_per_chunk),
+      num_subsampled_frames = std::min<int32>(num_subsampled_frames_ -
+                                              start_subsampled_frame,
+                                              subsampled_frames_per_chunk),
       last_subsampled_frame = start_subsampled_frame + num_subsampled_frames - 1;
   KALDI_ASSERT(num_subsampled_frames > 0);
   // the output-frame numbers are the subsampled-frame numbers
@@ -122,8 +122,15 @@ void NnetDecodableBase::EnsureFrameIsComputed(int32 subsampled_frame) {
       last_output_frame = last_subsampled_frame * subsampling_factor;
 
   KALDI_ASSERT(opts_.extra_left_context >= 0 && opts_.extra_right_context >= 0);
-  int32 left_context = nnet_left_context_ + opts_.extra_left_context,
-      right_context = nnet_right_context_ + opts_.extra_right_context;
+  int32 extra_left_context = opts_.extra_left_context,
+      extra_right_context = opts_.extra_right_context;
+  if (first_output_frame == 0 && opts_.extra_left_context_initial >= 0)
+    extra_left_context = opts_.extra_left_context_initial;
+  if (last_subsampled_frame == num_subsampled_frames_ - 1 &&
+      opts_.extra_right_context_final >= 0)
+    extra_right_context = opts_.extra_right_context_final;
+  int32 left_context = nnet_left_context_ + extra_left_context,
+      right_context = nnet_right_context_ + extra_right_context;
   int32 first_input_frame = first_output_frame - left_context,
       last_input_frame = last_output_frame + right_context,
       num_input_frames = last_input_frame + 1 - first_input_frame;
diff --git a/src/nnet3/nnet-am-decodable-simple.h b/src/nnet3/nnet-am-decodable-simple.h
index 7faa0755e74..45652c8e4ba 100644
--- a/src/nnet3/nnet-am-decodable-simple.h
+++ b/src/nnet3/nnet-am-decodable-simple.h
@@ -38,6 +38,8 @@ namespace nnet3 {
 struct NnetSimpleComputationOptions {
   int32 extra_left_context;
   int32 extra_right_context;
+  int32 extra_left_context_initial;
+  int32 extra_right_context_final;
   int32 frame_subsampling_factor;
   int32 frames_per_chunk;
   BaseFloat acoustic_scale;
@@ -47,7 +49,9 @@ struct NnetSimpleComputationOptions {
 
   NnetSimpleComputationOptions():
       extra_left_context(0),
-      extra_right_context(0),      
+      extra_right_context(0),
+      extra_left_context_initial(-1),
+      extra_right_context_final(-1),
       frame_subsampling_factor(1),
       frames_per_chunk(50),
       acoustic_scale(0.1),
@@ -58,14 +62,20 @@ struct NnetSimpleComputationOptions {
                    "Number of frames of additional left-context to add on top "
                    "of the neural net's inherent left context (may be useful in "
                    "recurrent setups");
-    opts->Register("frame-subsampling-factor", &frame_subsampling_factor,
-                   "Required if the frame-rate of the output (e.g. in 'chain' "
-                   "models) is less than the frame-rate of the original "
-                   "alignment.");
     opts->Register("extra-right-context", &extra_right_context,
                    "Number of frames of additional right-context to add on top "
                    "of the neural net's inherent right context (may be useful in "
                    "recurrent setups");
+    opts->Register("extra-left-context-initial", &extra_left_context_initial,
+                   "If >=0, overrides the --extra-left-context value at the "
+                   "start of an utterance.");
+    opts->Register("extra-right-context-final", &extra_right_context_final,
+                   "If >=0, overrides the --extra-right-context value at the "
+                   "end of an utterance.");
+    opts->Register("frame-subsampling-factor", &frame_subsampling_factor,
+                   "Required if the frame-rate of the output (e.g. in 'chain' "
+                   "models) is less than the frame-rate of the original "
+                   "alignment.");
     opts->Register("acoustic-scale", &acoustic_scale,
                    "Scaling factor for acoustic log-likelihoods");
     opts->Register("frames-per-chunk", &frames_per_chunk,
diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc
index c9ea698d48b..810ee2b471a 100644
--- a/src/nnet3/nnet-chain-combine.cc
+++ b/src/nnet3/nnet-chain-combine.cc
@@ -427,7 +427,7 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet(
                                                 end = egs_.end();
   for (; iter != end; ++iter)
     prob_computer_->Compute(*iter);
-  const SimpleObjectiveInfo *objf_info =
+  const ChainObjectiveInfo *objf_info =
       prob_computer_->GetObjective("output");
   if (objf_info == NULL)
     KALDI_ERR << "Error getting objective info (unsuitable egs?)";
@@ -436,7 +436,7 @@ double NnetChainCombiner::ComputeObjfAndDerivFromNnet(
   VectorizeNnet(deriv, nnet_params_deriv);
   // we prefer to deal with normalized objective functions.
   nnet_params_deriv->Scale(1.0 / objf_info->tot_weight);
-  return objf_info->tot_objective / objf_info->tot_weight;
+  return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight;
 }
 
 
diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc
index 37ebe85de81..46e2b0c01dc 100644
--- a/src/nnet3/nnet-chain-diagnostics.cc
+++ b/src/nnet3/nnet-chain-diagnostics.cc
@@ -65,9 +65,18 @@ void NnetChainComputeProb::Compute(const NnetChainExample &chain_eg) {
   bool need_model_derivative = nnet_config_.compute_deriv,
       store_component_stats = false;
   ComputationRequest request;
+  // if the options specify cross-entropy regularization, we'll be computing
+  // this objective (not interpolated with the regular objective-- we give it a
+  // separate name), but currently we won't make it contribute to the
+  // derivative-- we just compute the derivative of the regular output.
+  // This is because in the place where we use the derivative (the
+  // model-combination code) we decided to keep it simple and just use the
+  // regular objective.
+  bool use_xent_regularization = (chain_config_.xent_regularize != 0.0),
+      use_xent_derivative = false;
   GetChainComputationRequest(nnet_, chain_eg, need_model_derivative,
-                             store_component_stats,
-                             &request);
+                             store_component_stats, use_xent_regularization,
+                             use_xent_derivative, &request);
   const NnetComputation *computation = compiler_.Compile(request);
   NnetComputer computer(nnet_config_.compute_config, *computation,
                         nnet_, deriv_nnet_);
@@ -93,19 +102,24 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg,
       KALDI_ERR << "Network has no output named " << sup.name;
 
     const CuMatrixBase<BaseFloat> &nnet_output = computer->GetOutput(sup.name);
-    CuMatrix<BaseFloat> nnet_output_deriv;
+    bool use_xent = (chain_config_.xent_regularize != 0.0);
+    std::string xent_name = sup.name + "-xent";  // typically "output-xent".
+    CuMatrix<BaseFloat> nnet_output_deriv, xent_deriv;
     if (nnet_config_.compute_deriv)
       nnet_output_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
                                kUndefined);
-
-    BaseFloat tot_objf, tot_weight;
-
+    if (use_xent)
+      xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                        kUndefined);
+      
+    BaseFloat tot_like, tot_l2_term, tot_weight;
+    
     ComputeChainObjfAndDeriv(chain_config_, den_graph_,
                              sup.supervision, nnet_output,
-                             &tot_objf, &tot_weight,
-                             (nnet_config_.compute_deriv ?
-                              &nnet_output_deriv : NULL));
-
+                             &tot_like, &tot_l2_term, &tot_weight,
+                             (nnet_config_.compute_deriv ? &nnet_output_deriv :
+                              NULL), (use_xent ? &xent_deriv : NULL));
+    
     // note: in this context we don't want to apply 'sup.deriv_weights' because
     // this code is used only in combination, where it's part of an L-BFGS
     // optimization algorithm, and in that case if there is a mismatch between
@@ -114,20 +128,33 @@ void NnetChainComputeProb::ProcessOutputs(const NnetChainExample &eg,
     // and conjugate gradient descent both rely on the derivatives being
     // accurate, and don't fail gracefully if the derivatives are not accurate).
 
-    SimpleObjectiveInfo &totals = objf_info_[sup.name];
+    ChainObjectiveInfo &totals = objf_info_[sup.name];
     totals.tot_weight += tot_weight;
-    totals.tot_objective += tot_objf;
+    totals.tot_like += tot_like;
+    totals.tot_l2_term += tot_l2_term;
 
     if (nnet_config_.compute_deriv)
       computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv);
 
+    if (use_xent) {
+      ChainObjectiveInfo &xent_totals = objf_info_[xent_name];
+      // this block computes the cross-entropy objective.
+      const CuMatrixBase<BaseFloat> &xent_output = computer->GetOutput(
+          xent_name);
+      // at this point, xent_deriv is posteriors derived from the numerator
+      // computation.  note, xent_deriv has a factor of '.supervision.weight',
+      // but so does tot_weight.
+      BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans);
+      xent_totals.tot_weight += tot_weight;
+      xent_totals.tot_like += xent_objf;
+    }
     num_minibatches_processed_++;
   }
 }
 
 bool NnetChainComputeProb::PrintTotalStats() const {
   bool ans = false;
-  unordered_map<std::string, SimpleObjectiveInfo, StringHasher>::const_iterator
+  unordered_map<std::string, ChainObjectiveInfo, StringHasher>::const_iterator
       iter, end;
   iter = objf_info_.begin();
   end = objf_info_.end();
@@ -135,11 +162,21 @@ bool NnetChainComputeProb::PrintTotalStats() const {
     const std::string &name = iter->first;
     int32 node_index = nnet_.GetNodeIndex(name);
     KALDI_ASSERT(node_index >= 0);
-    const SimpleObjectiveInfo &info = iter->second;
-    KALDI_LOG << "Overall log-probability for '"
-              << name << "' is "
-              << (info.tot_objective / info.tot_weight) << " per frame"
-              << ", over " << info.tot_weight << " frames.";
+    const ChainObjectiveInfo &info = iter->second;
+    BaseFloat like = (info.tot_like / info.tot_weight),
+        l2_term = (info.tot_l2_term / info.tot_weight),
+        tot_objf = like + l2_term;
+    if (info.tot_l2_term == 0.0) {
+      KALDI_LOG << "Overall log-probability for '"
+                << name << "' is "
+                << like << " per frame"
+                << ", over " << info.tot_weight << " frames.";
+    } else {
+      KALDI_LOG << "Overall log-probability for '"
+                << name << "' is "
+                << like << " + " << l2_term << " = " << tot_objf << " per frame"
+                << ", over " << info.tot_weight << " frames.";
+    }
     if (info.tot_weight > 0)
       ans = true;
   }
@@ -147,9 +184,9 @@ bool NnetChainComputeProb::PrintTotalStats() const {
 }
 
 
-const SimpleObjectiveInfo* NnetChainComputeProb::GetObjective(
+const ChainObjectiveInfo* NnetChainComputeProb::GetObjective(
     const std::string &output_name) const {
-  unordered_map<std::string, SimpleObjectiveInfo, StringHasher>::const_iterator
+  unordered_map<std::string, ChainObjectiveInfo, StringHasher>::const_iterator
       iter = objf_info_.find(output_name);
   if (iter != objf_info_.end())
     return &(iter->second);
diff --git a/src/nnet3/nnet-chain-diagnostics.h b/src/nnet3/nnet-chain-diagnostics.h
index 797bd7c57d1..cb433b1ca4d 100644
--- a/src/nnet3/nnet-chain-diagnostics.h
+++ b/src/nnet3/nnet-chain-diagnostics.h
@@ -33,9 +33,23 @@ namespace kaldi {
 namespace nnet3 {
 
 
+struct ChainObjectiveInfo {
+  double tot_weight;   // accumulated weight; divisor for per-frame averages.
+  double tot_like;     // accumulated objective value, excluding the l2 term.
+  double tot_l2_term;  // accumulated l2 term; added to tot_like for the total.
+  ChainObjectiveInfo(): tot_weight(0.0),
+                        tot_like(0.0),
+                        tot_l2_term(0.0) { }
+};
+
 
-/** This class is for computing objective-function values in a nnet3+chain setup,
-    for diagnostics.  It also supports computing model derivatives.
+/** This class is for computing objective-function values in a nnet3+chain
+    setup, for diagnostics.  It also supports computing model derivatives.
+    Note: if the --xent-regularize option is nonzero, the cross-entropy
+    objective will be computed, and displayed when you call PrintTotalStats(),
+    but it will not contribute to model derivatives (there is no code to compute
+    the regularized objective function, and anyway it's not clear that we really
+    need this regularization in the combination phase).
  */
 class NnetChainComputeProb {
  public:
@@ -56,7 +70,7 @@ class NnetChainComputeProb {
 
   // returns the objective-function info for this output name (e.g. "output"),
   // or NULL if there is no such info.
-  const SimpleObjectiveInfo *GetObjective(const std::string &output_name) const;
+  const ChainObjectiveInfo *GetObjective(const std::string &output_name) const;
 
   // if config.compute_deriv == true, returns a reference to the
   // computed derivative.  Otherwise crashes.
@@ -75,7 +89,7 @@ class NnetChainComputeProb {
   Nnet *deriv_nnet_;
   int32 num_minibatches_processed_;  // this is only for diagnostics
 
-  unordered_map<std::string, SimpleObjectiveInfo, StringHasher> objf_info_;
+  unordered_map<std::string, ChainObjectiveInfo, StringHasher> objf_info_;
 
 };
 
diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc
index 8c39829f650..76bc8bbc66b 100644
--- a/src/nnet3/nnet-chain-example.cc
+++ b/src/nnet3/nnet-chain-example.cc
@@ -25,49 +25,6 @@ namespace kaldi {
 namespace nnet3 {
 
 
-// writes compressed as unsigned char a vector 'vec' that is required to have
-// values between 0 and 1.
-static inline void WriteVectorAsChar(std::ostream &os,
-                                     bool binary,
-                                     const VectorBase<BaseFloat> &vec) {
-  if (binary) {
-    int32 dim = vec.Dim();
-    std::vector<unsigned char> char_vec(dim);
-    const BaseFloat *data = vec.Data();
-    for (int32 i = 0; i < dim; i++) {
-      BaseFloat value = data[i];
-      KALDI_ASSERT(value >= 0.0 && value <= 1.0);
-      // below, the adding 0.5 is done so that we round to the closest integer
-      // rather than rounding down (since static_cast will round down).
-      char_vec[i] = static_cast<unsigned char>(255.0 * value + 0.5);
-    }
-    WriteIntegerVector(os, binary, char_vec);
-  } else {
-    // the regular floating-point format will be more readable for text mode.
-    vec.Write(os, binary);
-  }
-}
-
-// reads data written by WriteVectorAsChar.
-static inline void ReadVectorAsChar(std::istream &is,
-                                    bool binary,
-                                    Vector<BaseFloat> *vec) {
-  if (binary) {
-    BaseFloat scale = 1.0 / 255.0;
-    std::vector<unsigned char> char_vec;
-    ReadIntegerVector(is, binary, &char_vec);
-    int32 dim = char_vec.size();
-    vec->Resize(dim, kUndefined);
-    BaseFloat *data = vec->Data();
-    for (int32 i = 0; i < dim; i++)
-      data[i] = scale * char_vec[i];
-  } else {
-    vec->Read(is, binary);
-  }
-}
-
-
-
 void NnetChainSupervision::Write(std::ostream &os, bool binary) const {
   CheckDim();
   WriteToken(os, binary, "<NnetChainSup>");
@@ -359,11 +316,13 @@ void GetChainComputationRequest(const Nnet &nnet,
                                 const NnetChainExample &eg,
                                 bool need_model_derivative,
                                 bool store_component_stats,
+                                bool use_xent_regularization,
+                                bool use_xent_derivative,
                                 ComputationRequest *request) {
   request->inputs.clear();
   request->inputs.reserve(eg.inputs.size());
   request->outputs.clear();
-  request->outputs.reserve(eg.outputs.size());
+  request->outputs.reserve(eg.outputs.size() * 2);
   request->need_model_derivative = need_model_derivative;
   request->store_component_stats = store_component_stats;
   for (size_t i = 0; i < eg.inputs.size(); i++) {
@@ -395,6 +354,19 @@ void GetChainComputationRequest(const Nnet &nnet,
     io_spec.name = name;
     io_spec.indexes = sup.indexes;
     io_spec.has_deriv = need_model_derivative;
+
+    if (use_xent_regularization) {
+      size_t cur_size = request->outputs.size();
+      request->outputs.resize(cur_size + 1);
+      IoSpecification &io_spec = request->outputs[cur_size - 1],
+          &io_spec_xent = request->outputs[cur_size];
+      // the IoSpecification for the -xent output is the same
+      // as for the regular output, except for its name which has
+      // the -xent suffix (and the has_deriv member may differ).
+      io_spec_xent = io_spec;
+      io_spec_xent.name = name + "-xent";
+      io_spec_xent.has_deriv = use_xent_derivative;
+    }
   }
   // check to see if something went wrong.
   if (request->inputs.empty())
diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h
index 705e6f818f4..323e73da8da 100644
--- a/src/nnet3/nnet-chain-example.h
+++ b/src/nnet3/nnet-chain-example.h
@@ -177,11 +177,21 @@ void TruncateDerivWeights(int32 truncate,
      can create the ComputationRequest manually.  Assumes that if
      need_model_derivative is true, you will be supplying derivatives w.r.t. all
      outputs.
+
+     If use_xent_regularization == true, then it assumes that for each output
+     name (e.g. "output") in the eg, there is another output with the same
+     dimension and with the suffix "-xent" on its name, e.g. named
+     "output-xent".  The derivative w.r.t. the xent objective will only be
+     supplied to the nnet computation if 'use_xent_derivative' is true (we
+     propagate back the xent derivative to the model only in training, not in
+     model-combination in nnet3-chain-combine).
 */
 void GetChainComputationRequest(const Nnet &nnet,
                                 const NnetChainExample &eg,
                                 bool need_model_derivative,
                                 bool store_component_stats,
+                                bool use_xent_regularization,
+                                bool use_xent_derivative,
                                 ComputationRequest *computation_request);
 
 
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 781fc96417b..dee0eee2a33 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -50,9 +50,11 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts,
 void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
   bool need_model_derivative = true;
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0);
   ComputationRequest request;
   GetChainComputationRequest(*nnet_, chain_eg, need_model_derivative,
                              nnet_config.store_component_stats,
+                             use_xent_regularization, need_model_derivative,
                              &request);
   const NnetComputation *computation = compiler_.Compile(request);
 
@@ -108,23 +110,50 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
                                           nnet_output.NumCols(),
                                           kUndefined);
 
-    BaseFloat tot_objf, tot_weight;
+    bool use_xent = (opts_.chain_config.xent_regularize != 0.0);
+    std::string xent_name = sup.name + "-xent";  // typically "output-xent".
+    CuMatrix<BaseFloat> xent_deriv;
+    if (use_xent)
+      xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                        kUndefined);
+
+    BaseFloat tot_objf, tot_l2_term, tot_weight;
 
     ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_,
                              sup.supervision, nnet_output,
-                             &tot_objf, &tot_weight,
-                             &nnet_output_deriv);
+                             &tot_objf, &tot_l2_term, &tot_weight,
+                             &nnet_output_deriv,
+                             (use_xent ? &xent_deriv : NULL));
+
+    if (use_xent) {
+      // this block computes the cross-entropy objective.
+      const CuMatrixBase<BaseFloat> &xent_output = computer->GetOutput(
+          xent_name);
+      // at this point, xent_deriv is posteriors derived from the numerator
+      // computation.  note, xent_objf has a factor of '.supervision.weight'
+      BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans);
+      objf_info_[xent_name].UpdateStats(xent_name, opts_.nnet_config.print_interval,
+                                        num_minibatches_processed_,
+                                        tot_weight, xent_objf);
+    }
 
     if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
       CuVector<BaseFloat> cu_deriv_weights(sup.deriv_weights);
       nnet_output_deriv.MulRowsVec(cu_deriv_weights);
+      if (use_xent)
+        xent_deriv.MulRowsVec(cu_deriv_weights);
     }
 
     computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv);
 
     objf_info_[sup.name].UpdateStats(sup.name, opts_.nnet_config.print_interval,
                                      num_minibatches_processed_++,
-                                     tot_weight, tot_objf);
+                                     tot_weight, tot_objf, tot_l2_term);
+
+    if (use_xent) {
+      xent_deriv.Scale(opts_.chain_config.xent_regularize);
+      computer->AcceptOutputDeriv(xent_name, &xent_deriv);
+    }
   }
 }
 
@@ -137,7 +166,7 @@ bool NnetChainTrainer::PrintTotalStats() const {
   for (; iter != end; ++iter) {
     const std::string &name = iter->first;
     const ObjectiveFunctionInfo &info = iter->second;
-    ans = ans || info.PrintTotalStats(name);
+    ans = info.PrintTotalStats(name) || ans;
   }
   return ans;
 }
diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h
index 11a0e0cfd6d..0b651372fe1 100644
--- a/src/nnet3/nnet-chain-training.h
+++ b/src/nnet3/nnet-chain-training.h
@@ -43,7 +43,7 @@ struct NnetChainTrainingOptions {
     chain_config.Register(opts);
     opts->Register("apply-deriv-weights", &apply_deriv_weights,
                    "If true, apply the per-frame derivative weights stored with "
-                   "the example (you'll normally want to leave this as true.");
+                   "the example");
   }
 };
 
diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h
index a6bef58e6fc..298548857dd 100644
--- a/src/nnet3/nnet-diagnostics.h
+++ b/src/nnet3/nnet-diagnostics.h
@@ -34,7 +34,6 @@ namespace nnet3 {
 struct SimpleObjectiveInfo {
   double tot_weight;
   double tot_objective;
-
   SimpleObjectiveInfo(): tot_weight(0.0),
                          tot_objective(0.0) { }
 
diff --git a/src/nnet3/nnet-discriminative-diagnostics.cc b/src/nnet3/nnet-discriminative-diagnostics.cc
new file mode 100644
index 00000000000..381bec09206
--- /dev/null
+++ b/src/nnet3/nnet-discriminative-diagnostics.cc
@@ -0,0 +1,213 @@
+// nnet3/nnet-discriminative-diagnostics.cc
+
+// Copyright  2012-2015    Johns Hopkins University (author: Daniel Povey)
+// Copyright  2014-2015    Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet3/nnet-discriminative-diagnostics.h"
+#include "nnet3/nnet-utils.h"
+#include "nnet3/discriminative-training.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+NnetDiscriminativeComputeObjf::NnetDiscriminativeComputeObjf(
+    const NnetComputeProbOptions &nnet_config,
+    const discriminative::DiscriminativeTrainingOptions &discriminative_training_config,
+    const TransitionModel &tmodel,
+    const VectorBase<BaseFloat> &priors,
+    const Nnet &nnet):
+    nnet_config_(nnet_config),
+    discriminative_training_config_(discriminative_training_config),
+    tmodel_(tmodel),
+    log_priors_(priors),
+    nnet_(nnet),
+    compiler_(nnet, nnet_config_.optimize_config),
+    deriv_nnet_(NULL),
+    num_minibatches_processed_(0) {
+  log_priors_.ApplyLog();
+  if (nnet_config_.compute_deriv) {
+    deriv_nnet_ = new Nnet(nnet_);
+    bool is_gradient = true;  // force simple update
+    SetZero(is_gradient, deriv_nnet_);
+  }
+}
+
+const Nnet& NnetDiscriminativeComputeObjf::GetDeriv() const {
+  if (deriv_nnet_ == NULL)
+    KALDI_ERR << "GetDeriv() called when no derivatives were requested.";
+  return *deriv_nnet_;
+}
+
+NnetDiscriminativeComputeObjf::~NnetDiscriminativeComputeObjf() {
+  delete deriv_nnet_;  // delete does nothing if pointer is NULL.
+}
+
+void NnetDiscriminativeComputeObjf::Reset() {
+  num_minibatches_processed_ = 0;
+  objf_info_.clear();
+  if (deriv_nnet_) {
+    bool is_gradient = true;
+    SetZero(is_gradient, deriv_nnet_);
+  }
+}
+
+void NnetDiscriminativeComputeObjf::Compute(const NnetDiscriminativeExample &eg) {
+  bool need_model_derivative = nnet_config_.compute_deriv,
+      store_component_stats = false;
+  bool use_xent_regularization = (discriminative_training_config_.xent_regularize != 0.0),
+      use_xent_derivative = false;
+
+  ComputationRequest request;
+  GetDiscriminativeComputationRequest(nnet_, eg, 
+                                      need_model_derivative,
+                                      store_component_stats,
+                                      use_xent_regularization, use_xent_derivative,
+                                      &request);
+  const NnetComputation *computation = compiler_.Compile(request);
+  NnetComputer computer(nnet_config_.compute_config, *computation,
+                        nnet_, deriv_nnet_);
+  // give the inputs to the computer object.
+  computer.AcceptInputs(nnet_, eg.inputs);
+  computer.Forward();
+  this->ProcessOutputs(eg, &computer);
+  if (nnet_config_.compute_deriv)
+    computer.Backward();
+}
+
+void NnetDiscriminativeComputeObjf::ProcessOutputs(const NnetDiscriminativeExample &eg,
+                                         NnetComputer *computer) {
+  // There will normally be just one output here, named 'output',
+  // but the code is more general than this.
+  std::vector<NnetDiscriminativeSupervision>::const_iterator iter = eg.outputs.begin(),
+      end = eg.outputs.end();
+  for (; iter != end; ++iter) {
+    const NnetDiscriminativeSupervision &sup = *iter;
+    int32 node_index = nnet_.GetNodeIndex(sup.name);
+    if (node_index < 0 ||
+        !nnet_.IsOutputNode(node_index))
+      KALDI_ERR << "Network has no output named " << sup.name;
+
+    const CuMatrixBase<BaseFloat> &nnet_output = computer->GetOutput(sup.name);
+    
+    bool use_xent = (discriminative_training_config_.xent_regularize != 0.0);
+    std::string xent_name = sup.name + "-xent";  // typically "output-xent".
+    CuMatrix<BaseFloat> nnet_output_deriv, xent_deriv;
+
+    if (nnet_config_.compute_deriv)
+      nnet_output_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                               kUndefined);
+    
+    if (use_xent)
+      xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                        kUndefined);
+
+    discriminative::DiscriminativeTrainingStats stats;
+    BaseFloat tot_l2_term;
+
+    discriminative::ComputeDiscriminativeObjfAndDeriv(discriminative_training_config_, 
+                                                      tmodel_, log_priors_,
+                                                      sup.supervision, nnet_output,
+                                                      &stats, &tot_l2_term,
+                                                      (nnet_config_.compute_deriv ?
+                                                       &nnet_output_deriv : NULL),
+                                                      (use_xent ? &xent_deriv : NULL));
+
+    DiscriminativeObjectiveInfo &totals = objf_info_[sup.name];
+    totals.stats.Add(stats);
+    totals.tot_l2_term += tot_l2_term;
+
+    // note: in this context we don't want to apply 'sup.deriv_weights' because
+    // this code is used only in combination, where it's part of an L-BFGS
+    // optimization algorithm, and in that case if there is a mismatch between
+    // the computed objective function and the derivatives, it may cause errors
+    // in the optimization procedure such as early termination.  (line search
+    // and conjugate gradient descent both rely on the derivatives being
+    // accurate, and don't fail gracefully if the derivatives are not accurate).
+    
+    if (nnet_config_.compute_deriv)
+      computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv);
+    
+    if (use_xent) {
+      DiscriminativeObjectiveInfo &xent_totals = objf_info_[xent_name];
+      // this block computes the cross-entropy objective.
+      const CuMatrixBase<BaseFloat> &xent_output = computer->GetOutput(
+          xent_name);
+      // at this point, xent_deriv is posteriors derived from the numerator
+      // computation.  note, xent_deriv has a factor of '.supervision.weight',
+      // but so does tot_weight.
+      BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans);
+      xent_totals.stats.tot_t_weighted += stats.TotalT();
+      xent_totals.stats.tot_objf += xent_objf;
+    }
+    
+    num_minibatches_processed_++;
+  }
+}
+
+bool NnetDiscriminativeComputeObjf::PrintTotalStats() const {
+  bool ans = false;
+  unordered_map<std::string, DiscriminativeObjectiveInfo, StringHasher>::const_iterator
+      iter, end;
+  iter = objf_info_.begin();
+  end = objf_info_.end();
+  for (; iter != end; ++iter) {
+    const std::string &name = iter->first;
+    int32 node_index = nnet_.GetNodeIndex(name);
+    KALDI_ASSERT(node_index >= 0);
+    const DiscriminativeObjectiveInfo &info = iter->second;
+    BaseFloat tot_weight = info.stats.TotalT();
+    BaseFloat tot_objective = info.stats.TotalObjf(discriminative_training_config_.criterion);
+    
+    info.stats.PrintAll(discriminative_training_config_.criterion);
+
+    if (info.tot_l2_term == 0.0) {
+      KALDI_LOG << "Overall " << discriminative_training_config_.criterion
+                << " objective for '"
+                << name << "' is "
+                << (tot_objective / tot_weight) 
+                << " per frame, "
+                << "over " << tot_weight << " frames.";
+    } else {
+      KALDI_LOG << "Overall " << discriminative_training_config_.criterion
+                << " objective for '"
+                << name << "' is "
+                << (tot_objective / tot_weight) 
+                << " + " << (info.tot_l2_term / tot_weight)
+                << " per frame, "
+                << "over " << tot_weight << " frames.";
+    }
+
+    if (tot_weight > 0)
+      ans = true;
+  }
+  return ans;
+}
+
+const DiscriminativeObjectiveInfo* NnetDiscriminativeComputeObjf::GetObjective(
+    const std::string &output_name) const {
+  unordered_map<std::string, DiscriminativeObjectiveInfo, StringHasher>::const_iterator
+      iter = objf_info_.find(output_name);
+  if (iter != objf_info_.end())
+    return &(iter->second);
+  else
+    return NULL;
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
diff --git a/src/nnet3/nnet-discriminative-diagnostics.h b/src/nnet3/nnet-discriminative-diagnostics.h
new file mode 100644
index 00000000000..fb96570f851
--- /dev/null
+++ b/src/nnet3/nnet-discriminative-diagnostics.h
@@ -0,0 +1,94 @@
+// nnet3/nnet-discriminative-diagnostics.h
+
+// Copyright    2012-2015  Johns Hopkins University (author: Daniel Povey)
+// Copyright    2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET3_NNET_DISCRIMINATIVE_DIAGNOSTICS_H_
+#define KALDI_NNET3_NNET_DISCRIMINATIVE_DIAGNOSTICS_H_
+
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-computation.h"
+#include "nnet3/nnet-compute.h"
+#include "nnet3/nnet-optimize.h"
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet3/nnet-diagnostics.h"
+#include "nnet3/discriminative-training.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+struct DiscriminativeObjectiveInfo {  // per-output totals kept by NnetDiscriminativeComputeObjf.
+  double tot_l2_term;  // starts at 0.0 (see the constructor below).
+  discriminative::DiscriminativeTrainingStats stats;  // accumulated objective statistics.
+  DiscriminativeObjectiveInfo() : tot_l2_term(0.0) { }
+};
+
+/** This class is for computing objective-function values in nnet3
+    discriminative training, for diagnostics.  It also supports computing model derivatives.
+ */
+class NnetDiscriminativeComputeObjf {
+ public:
+  // the two config structs are kept by value; 'tmodel' and 'nnet' are kept by reference.
+  NnetDiscriminativeComputeObjf(const NnetComputeProbOptions &nnet_config,
+                                const discriminative::DiscriminativeTrainingOptions &discriminative_training_config,
+                                const TransitionModel &tmodel,
+                                const VectorBase<BaseFloat> &priors,
+                                const Nnet &nnet);
+
+  // Reset the likelihood stats, and the derivative stats (if computed).
+  void Reset();
+
+  // compute objective on one minibatch.
+  void Compute(const NnetDiscriminativeExample &eg);
+
+  // Prints out the final stats, and returns true if there was a nonzero count.
+  bool PrintTotalStats() const;
+
+  // returns the objective-function info for this output name (e.g. "output"),
+  // or NULL if there is no such info.
+  const DiscriminativeObjectiveInfo *GetObjective(const std::string &output_name) const;
+
+  // if config.compute_deriv == true, returns a reference to the
+  // computed derivative.  Otherwise crashes.
+  const Nnet &GetDeriv() const;
+  
+  ~NnetDiscriminativeComputeObjf();
+ private:
+  void ProcessOutputs(const NnetDiscriminativeExample &eg,
+                      NnetComputer *computer);
+
+  NnetComputeProbOptions nnet_config_;
+  discriminative::DiscriminativeTrainingOptions discriminative_training_config_;
+  const TransitionModel &tmodel_;  // reference: the caller's object must outlive this one.
+  CuVector<BaseFloat> log_priors_;
+  const Nnet &nnet_;  // reference: the caller's object must outlive this one.
+  CachingOptimizingCompiler compiler_;
+  Nnet *deriv_nnet_;  // NOTE(review): presumably only allocated when derivatives are requested -- confirm in the .cc.
+  int32 num_minibatches_processed_;  // this is only for diagnostics
+
+  unordered_map<std::string, DiscriminativeObjectiveInfo, StringHasher> objf_info_;  // keyed by output-node name.
+};
+
+
+
+
+} // namespace nnet3
+} // namespace kaldi
+
+#endif // KALDI_NNET3_NNET_DISCRIMINATIVE_DIAGNOSTICS_H_
+
diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc
new file mode 100644
index 00000000000..e9a063e268e
--- /dev/null
+++ b/src/nnet3/nnet-discriminative-example.cc
@@ -0,0 +1,419 @@
+// nnet3/nnet-discriminative-example.cc
+
+// Copyright      2015    Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cmath>
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet3/nnet-example-utils.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+
+void NnetDiscriminativeSupervision::Write(std::ostream &os, bool binary) const {
+  CheckDim();  // assert internal consistency before anything is written.
+  WriteToken(os, binary, "<NnetDiscriminativeSup>");
+  WriteToken(os, binary, name);  // the output name is itself written as a token.
+  WriteIndexVector(os, binary, indexes);
+  supervision.Write(os, binary);
+  WriteToken(os, binary, "<DW>");  // for DerivWeights.  Want to save space.
+  WriteVectorAsChar(os, binary, deriv_weights);  // the field order here must mirror Read().
+  WriteToken(os, binary, "</NnetDiscriminativeSup>");
+}
+
+bool NnetDiscriminativeSupervision::operator == (const NnetDiscriminativeSupervision &other) const {
+  // everything must match exactly, except deriv_weights which is compared approximately.
+  if (!(name == other.name) || !(indexes == other.indexes)) return false;
+  return supervision == other.supervision && deriv_weights.ApproxEqual(other.deriv_weights);
+}
+
+void NnetDiscriminativeSupervision::Read(std::istream &is, bool binary) {
+  ExpectToken(is, binary, "<NnetDiscriminativeSup>");
+  ReadToken(is, binary, &name);  // fields are read in the order Write() emits them.
+  ReadIndexVector(is, binary, &indexes);
+  supervision.Read(is, binary);
+  ExpectToken(is, binary, "<DW>");  // marks the start of the deriv_weights.
+  ReadVectorAsChar(is, binary, &deriv_weights);
+  ExpectToken(is, binary, "</NnetDiscriminativeSup>");
+  CheckDim();  // assert that what was just read is self-consistent.
+}
+
+
+// Asserts that 'indexes' and 'deriv_weights' are consistent with 'supervision'.
+void NnetDiscriminativeSupervision::CheckDim() const {
+  if (supervision.frames_per_sequence == -1) {
+    // -1 marks an object that was never set up; it must hold no indexes.
+    KALDI_ASSERT(indexes.empty());
+    return;
+  }
+  KALDI_ASSERT(indexes.size() == supervision.num_sequences *
+               supervision.frames_per_sequence && !indexes.empty() &&
+               supervision.frames_per_sequence > 1);
+  const int32 num_sequences = supervision.num_sequences,
+      num_indexes = num_sequences * supervision.frames_per_sequence,
+      first_frame = indexes[0].t,
+      frame_skip = indexes[num_sequences].t - first_frame;
+  // Expected layout: frame 0 of every sequence, then frame 1 of every
+  // sequence, and so on, with consecutive frames 'frame_skip' apart in 't'.
+  for (int32 k = 0; k < num_indexes; k++) {
+    int32 n = k % num_sequences, t = (k / num_sequences) * frame_skip + first_frame, x = 0;
+    Index index(n, t, x);
+    KALDI_ASSERT(indexes[k] == index);
+  }
+  if (deriv_weights.Dim() == 0)
+    return;  // an empty vector means no per-frame weights; nothing to check.
+  KALDI_ASSERT(deriv_weights.Dim() == indexes.size());
+  KALDI_ASSERT(deriv_weights.Min() >= 0.0 &&
+               deriv_weights.Max() <= 1.0);
+}
+
+NnetDiscriminativeSupervision::NnetDiscriminativeSupervision(const NnetDiscriminativeSupervision &other):
+    name(other.name),
+    indexes(other.indexes),
+    supervision(other.supervision),
+    deriv_weights(other.deriv_weights) { CheckDim(); }  // member-wise copy, then a consistency check.
+
+NnetDiscriminativeSupervision::NnetDiscriminativeSupervision(
+    const std::string &name,
+    const discriminative::DiscriminativeSupervision &supervision,
+    const Vector<BaseFloat> &deriv_weights,
+    int32 first_frame,
+    int32 frame_skip):
+    name(name),
+    supervision(supervision),
+    deriv_weights(deriv_weights) {
+  const int32 num_sequences = supervision.num_sequences,
+      frames_per_sequence = supervision.frames_per_sequence;
+  // resize() default-constructs each Index; this will set the 'x' index to zero.
+  indexes.resize(num_sequences * frames_per_sequence);
+  int32 k = 0;
+  for (int32 frame = 0; frame < frames_per_sequence; frame++) {
+    for (int32 seq = 0; seq < num_sequences; seq++) {
+      indexes[k].n = seq;
+      indexes[k++].t = frame * frame_skip + first_frame;
+    }
+  }
+  KALDI_ASSERT(k == indexes.size());
+  CheckDim();
+}
+
+void NnetDiscriminativeSupervision::Swap(NnetDiscriminativeSupervision *other) {
+  name.swap(other->name);  // every member is exchanged with its counterpart in '*other'.
+  indexes.swap(other->indexes);
+  supervision.Swap(&(other->supervision));
+  deriv_weights.Swap(&(other->deriv_weights));
+  if (RandInt(0, 5) == 0)  // only spot-check on some calls (presumably to keep Swap cheap).
+    CheckDim();
+}
+
+
+void NnetDiscriminativeExample::Write(std::ostream &os, bool binary) const {
+  // Validate before emitting anything, so that a failed check cannot leave a
+  // partially-written example on the stream (Read() rejects empty vectors).
+  KALDI_ASSERT(!inputs.empty() && "Attempting to write NnetDiscriminativeExample with no inputs");
+  KALDI_ASSERT(!outputs.empty() && "Attempting to write NnetDiscriminativeExample with no outputs");
+  WriteToken(os, binary, "<Nnet3DiscriminativeEg>");
+  WriteToken(os, binary, "<NumInputs>");
+  int32 size = inputs.size();
+  WriteBasicType(os, binary, size);
+  if (!binary) os << '\n';
+  for (int32 i = 0; i < size; i++) {
+    inputs[i].Write(os, binary);
+    if (!binary) os << '\n';
+  }
+  WriteToken(os, binary, "<NumOutputs>");
+  size = outputs.size();
+  WriteBasicType(os, binary, size);
+  if (!binary) os << '\n';
+  for (int32 i = 0; i < size; i++) {
+    outputs[i].Write(os, binary);
+    if (!binary) os << '\n';
+  }
+  WriteToken(os, binary, "</Nnet3DiscriminativeEg>");
+}
+
+void NnetDiscriminativeExample::Read(std::istream &is, bool binary) {
+  int32 size;
+  ExpectToken(is, binary, "<Nnet3DiscriminativeEg>");
+  ExpectToken(is, binary, "<NumInputs>");
+  ReadBasicType(is, binary, &size);
+  if (!(size >= 1 && size <= 1000000))  // sanity limits on the stored count.
+    KALDI_ERR << "Invalid size " << size;
+  inputs.resize(size);
+  for (std::vector<NnetIo>::iterator iter = inputs.begin(); iter != inputs.end(); ++iter)
+    iter->Read(is, binary);
+  ExpectToken(is, binary, "<NumOutputs>");
+  ReadBasicType(is, binary, &size);
+  if (!(size >= 1 && size <= 1000000))
+    KALDI_ERR << "Invalid size " << size;
+  outputs.resize(size);
+  for (size_t i = 0; i < outputs.size(); i++)
+    outputs[i].Read(is, binary);
+  ExpectToken(is, binary, "</Nnet3DiscriminativeEg>");
+}
+
+void NnetDiscriminativeExample::Swap(NnetDiscriminativeExample *other) {
+  outputs.swap(other->outputs);  // the two member swaps are independent,
+  inputs.swap(other->inputs);    // so their order does not matter.
+}
+
+void NnetDiscriminativeExample::Compress() {
+  // Only the input features are touched.  Calling features.Compress() does
+  // nothing if they are sparse or already compressed.
+  for (size_t i = 0; i < inputs.size(); i++)
+    inputs[i].features.Compress();
+}
+
+NnetDiscriminativeExample::NnetDiscriminativeExample(const NnetDiscriminativeExample &other):
+    inputs(other.inputs), outputs(other.outputs) { }  // copies both member vectors; nothing else to do.
+
+// Merges the not-yet-merged (n == 0) objects in 'inputs' into '*output'; 'inputs' must be non-empty.
+void MergeSupervision(
+    const std::vector<const NnetDiscriminativeSupervision*> &inputs,
+    NnetDiscriminativeSupervision *output) {
+  int32 num_inputs = inputs.size(), num_indexes = 0;
+  KALDI_ASSERT(num_inputs > 0 && "MergeSupervision called with no inputs");
+  for (int32 n = 0; n < num_inputs; n++) {
+    KALDI_ASSERT(inputs[n]->name == inputs[0]->name);
+    num_indexes += inputs[n]->indexes.size();
+  }
+  output->name = inputs[0]->name;
+  std::vector<const discriminative::DiscriminativeSupervision*> input_supervision;
+  input_supervision.reserve(inputs.size());
+  for (int32 n = 0; n < num_inputs; n++)
+    input_supervision.push_back(&(inputs[n]->supervision));
+  std::vector<discriminative::DiscriminativeSupervision> output_supervision;
+  bool compactify = true;
+  discriminative::AppendSupervision(input_supervision, compactify,
+                                    &output_supervision);
+  if (output_supervision.size() != 1)
+    KALDI_ERR << "Failed to merge discriminative examples-- inconsistent lengths "
+              << "or weights?";
+  output->supervision.Swap(&(output_supervision[0]));
+
+  output->indexes.clear();
+  output->indexes.reserve(num_indexes);
+  for (int32 n = 0; n < num_inputs; n++) {
+    const std::vector<Index> &src_indexes = inputs[n]->indexes;
+    int32 cur_size = output->indexes.size();
+    output->indexes.insert(output->indexes.end(),
+                           src_indexes.begin(), src_indexes.end());
+    std::vector<Index>::iterator iter = output->indexes.begin() + cur_size,
+        end = output->indexes.end();
+    // change the 'n' index to correspond to the index into 'input'.
+    // Each example gets a different 'n' value, starting from 0.
+    for (; iter != end; ++iter) {
+      KALDI_ASSERT(iter->n == 0 && "Merging already-merged discriminative egs");
+      iter->n = n;
+    }
+  }
+  KALDI_ASSERT(output->indexes.size() == num_indexes);
+  // OK, at this point the 'indexes' are grouped by 'n' (one input after
+  // another), but they must be ordered first by 't' and next by 'n'.
+  // 'sort' will fix this, due to the operator < on type Index.  The sort is
+  // required: the CheckDim() call at the end asserts exactly that ordering.
+  std::sort(output->indexes.begin(), output->indexes.end());
+
+  // merge the deriv_weights.
+  if (inputs[0]->deriv_weights.Dim() != 0) {  // inputs[0] alone decides whether weights are merged.
+    int32 frames_per_sequence = inputs[0]->deriv_weights.Dim();
+    output->deriv_weights.Resize(output->indexes.size(), kUndefined);
+    KALDI_ASSERT(output->deriv_weights.Dim() ==
+                 frames_per_sequence * num_inputs);
+    for (int32 n = 0; n < num_inputs; n++) {
+      const Vector<BaseFloat> &src_deriv_weights = inputs[n]->deriv_weights;
+      KALDI_ASSERT(src_deriv_weights.Dim() == frames_per_sequence);
+      // the ordering of the deriv_weights corresponds to the ordering of the
+      // Indexes, where the time dimension has the greater stride.
+      for (int32 t = 0; t < frames_per_sequence; t++) {
+        output->deriv_weights(t * num_inputs + n) = src_deriv_weights(t);
+      }
+    }
+  }
+  output->CheckDim();
+}
+
+
+void MergeDiscriminativeExamples(bool compress,
+                        std::vector<NnetDiscriminativeExample> *input,
+                        NnetDiscriminativeExample *output) {
+  const int32 num_examples = input->size();
+  KALDI_ASSERT(num_examples > 0);
+  // MergeExamples() operates on NnetExample, so lend each example's input
+  // features to a temporary NnetExample for the duration of the call.
+  std::vector<NnetExample> borrowed(num_examples);
+  for (int32 j = 0; j < num_examples; j++)
+    borrowed[j].io.swap((*input)[j].inputs);
+  NnetExample merged;
+  MergeExamples(borrowed, compress, &merged);
+  // hand the features back, so that 'input' is not really changed.
+  for (int32 j = 0; j < num_examples; j++)
+    (*input)[j].inputs.swap(borrowed[j].io);
+  // the merged features become 'output->inputs'.
+  output->inputs.swap(merged.io);
+
+  // Now the discriminative supervision.  There will normally be just one
+  // output, with name "output", but any number of outputs is handled as
+  // long as every example has the same number of them.
+  const int32 num_output_names = (*input)[0].outputs.size();
+  output->outputs.resize(num_output_names);
+  for (int32 i = 0; i < num_output_names; i++) {
+    std::vector<const NnetDiscriminativeSupervision*> to_merge;
+    to_merge.reserve(num_examples);
+    for (int32 j = 0; j < num_examples; j++) {
+      KALDI_ASSERT((*input)[j].outputs.size() == num_output_names);
+      to_merge.push_back(&((*input)[j].outputs[i]));
+    }
+    MergeSupervision(to_merge, &(output->outputs[i]));
+  }
+}
+
+void TruncateDerivWeights(int32 truncate,
+                          NnetDiscriminativeExample *eg) {
+  for (size_t i = 0; i < eg->outputs.size(); i++) {
+    NnetDiscriminativeSupervision &sup = eg->outputs[i];
+    Vector<BaseFloat> &weights = sup.deriv_weights;
+    if (weights.Dim() == 0) {
+      // an empty vector stands for all-ones weights; make that explicit.
+      weights.Resize(sup.indexes.size());
+      weights.Set(1.0);
+    }
+    const int32 num_sequences = sup.supervision.num_sequences,
+        frames_per_sequence = sup.supervision.frames_per_sequence;
+    KALDI_ASSERT(2 * truncate  < frames_per_sequence);
+    for (int32 t = 0; t < truncate; t++) {
+      for (int32 s = 0; s < num_sequences; s++) {
+        weights(t * num_sequences + s) = 0.0;  // t'th frame from the start.
+        weights((frames_per_sequence - truncate + t) * num_sequences + s) = 0.0;  // and from the tail.
+      }
+    }
+  }
+}
+
+// Fills in '*request' with one IoSpecification per input and per output of 'eg'.
+void GetDiscriminativeComputationRequest(const Nnet &nnet,
+                                         const NnetDiscriminativeExample &eg,
+                                         bool need_model_derivative,
+                                         bool store_component_stats,
+                                         bool use_xent_regularization,
+                                         bool use_xent_derivative,
+                                         ComputationRequest *request) {
+  request->inputs.clear();
+  request->inputs.reserve(eg.inputs.size());
+  request->outputs.clear();
+  request->outputs.reserve(eg.outputs.size());
+  request->need_model_derivative = need_model_derivative;
+  request->store_component_stats = store_component_stats;
+  for (size_t i = 0; i < eg.inputs.size(); i++) {
+    const NnetIo &io = eg.inputs[i];
+    const std::string &name = io.name;
+    int32 node_index = nnet.GetNodeIndex(name);
+    // '||', not '&&': a missing node (-1) must never be passed to IsInputNode().
+    if (node_index == -1 ||
+        !nnet.IsInputNode(node_index))
+      KALDI_ERR << "Nnet example has input named '" << name
+                << "', but no such input node is in the network.";
+    request->inputs.resize(request->inputs.size() + 1);
+    IoSpecification &io_spec = request->inputs.back();
+    io_spec.name = name;
+    io_spec.indexes = io.indexes;
+    io_spec.has_deriv = false;
+  }
+  for (size_t i = 0; i < eg.outputs.size(); i++) {
+    // there will normally be exactly one output , named "output"
+    const NnetDiscriminativeSupervision &sup = eg.outputs[i];
+    const std::string &name = sup.name;
+    int32 node_index = nnet.GetNodeIndex(name);
+    if (node_index == -1 ||
+        !nnet.IsOutputNode(node_index))
+      KALDI_ERR << "Nnet example has output named '" << name
+                << "', but no such output node is in the network.";
+    request->outputs.resize(request->outputs.size() + 1);
+    IoSpecification &io_spec = request->outputs.back();
+    io_spec.name = name;
+    io_spec.indexes = sup.indexes;
+    io_spec.has_deriv = need_model_derivative;
+    if (use_xent_regularization) {
+      size_t cur_size = request->outputs.size();
+      request->outputs.resize(cur_size + 1);  // may reallocate, so re-fetch the references.
+      IoSpecification &io_spec_main = request->outputs[cur_size - 1],
+          &io_spec_xent = request->outputs[cur_size];
+      // the IoSpecification for the -xent output is the same
+      // as for the regular output, except for its name which has
+      // the -xent suffix (and the has_deriv member may differ).
+      io_spec_xent = io_spec_main;
+      io_spec_xent.name = name + "-xent";
+      io_spec_xent.has_deriv = use_xent_derivative;
+    }
+  }
+  // check to see if something went wrong.
+  if (request->inputs.empty())
+    KALDI_ERR << "No inputs in computation request.";
+  if (request->outputs.empty())
+    KALDI_ERR << "No outputs in computation request.";
+}
+
+void ShiftDiscriminativeExampleTimes(int32 frame_shift,
+                            const std::vector<std::string> &exclude_names,
+                            NnetDiscriminativeExample *eg) {
+  // Part 1: the inputs.  'frame_shift' is added to the 't' value of every
+  // index, except for inputs whose name is listed in 'exclude_names'
+  // (those are left exactly as they were).
+  for (size_t i = 0; i < eg->inputs.size(); i++) {
+    NnetIo &io = eg->inputs[i];
+    bool must_exclude = false;
+    for (size_t e = 0; e < exclude_names.size(); e++) {
+      if (io.name == exclude_names[e])
+        must_exclude = true;
+    }
+    if (must_exclude)
+      continue;
+    std::vector<Index> &in_indexes = io.indexes;
+    for (size_t k = 0; k < in_indexes.size(); k++)
+      in_indexes[k].t += frame_shift;
+  }
+
+  // Part 2: the supervision outputs.
+  // note: we'll normally choose a small enough shift that the output-data
+  // shift will be zero after dividing by frame_subsampling_factor
+  // (e.g. frame_subsampling_factor == 3 and shift = 0 or 1).
+  for (size_t i = 0; i < eg->outputs.size(); i++) {
+    std::vector<Index> &indexes = eg->outputs[i].indexes;
+    // the spacing in 't' between the first two indexes is taken to be the
+    // frame subsampling factor, so they must belong to the same sequence.
+    KALDI_ASSERT(indexes.size() >= 2 && indexes[0].n == indexes[1].n &&
+                 indexes[0].x == indexes[1].x);
+    int32 frame_subsampling_factor = indexes[1].t - indexes[0].t;
+    KALDI_ASSERT(frame_subsampling_factor > 0);
+
+    // The supervision can only move by a multiple of
+    // frame_subsampling_factor, so round the requested shift to the
+    // closest multiple.
+    double num_steps =
+        std::floor(0.5 + (frame_shift * 1.0 / frame_subsampling_factor));
+    int32 supervision_frame_shift = frame_subsampling_factor * num_steps;
+    if (supervision_frame_shift != 0) {
+      for (size_t k = 0; k < indexes.size(); k++)
+        indexes[k].t += supervision_frame_shift;
+    }
+  }
+}
+
+} // namespace nnet3
+} // namespace kaldi
+
diff --git a/src/nnet3/nnet-discriminative-example.h b/src/nnet3/nnet-discriminative-example.h
new file mode 100644
index 00000000000..7a3074c744a
--- /dev/null
+++ b/src/nnet3/nnet-discriminative-example.h
@@ -0,0 +1,280 @@
+// nnet3/nnet-discriminative-example.h
+
+// Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET3_NNET_DISCRIMINATIVE_EXAMPLE_H_
+#define KALDI_NNET3_NNET_DISCRIMINATIVE_EXAMPLE_H_
+
+#include "nnet3/nnet-nnet.h"
+#include "nnet3/nnet-computation.h"
+#include "util/table-types.h"
+#include "nnet3/discriminative-supervision.h"
+#include "nnet3/nnet-example.h"
+#include "hmm/posterior.h"
+#include "hmm/transition-model.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+// Glossary: mmi = Maximum Mutual Information,
+//          mpfe = Minimum Phone Frame Error
+//          smbr = State-level Minimum Bayes Risk
+//           nce = Negative Conditional Entropy
+//         esmbr = Extended State-level Minimum Bayes Risk
+//         empfe = Extended Phone Frame Error
+
+// esmbr and empfe use either posteriors or lattices for 
+// numerator lattices, whereas smbr and mpfe use alignments
+
+// This file relates to the creation of examples for discriminative training
+
+struct NnetDiscriminativeSupervision {
+  /// the name of the output in the neural net; in simple setups it
+  /// will just be "output".
+  std::string name;
+  
+  /// The indexes that the output corresponds to.  The size of this vector will
+  /// be equal to supervision.num_sequences * supervision.frames_per_sequence.
+  /// Be careful about the order of these indexes-- it is a little confusing.
+  /// The indexes in the 'indexes' vector are ordered as: (frame 0 of each sequence);
+  /// (frame 1 of each sequence); and so on.  But in the 'supervision' object,
+  /// the FST contains (sequence 0; sequence 1; ...).  So reordering is needed.
+  /// This is done for efficiency in the denominator computation (it helps memory
+  /// locality), as well as to match the ordering inside the neural net.
+  std::vector<Index> indexes;
+
+  /// The supervision object, containing the numerator and denominator
+  /// lattices.
+  discriminative::DiscriminativeSupervision supervision;
+
+  /// This is a vector of per-frame weights, required to be between 0 and 1,
+  /// that is applied to the derivative during training (but not during model
+  /// combination, where the derivatives need to agree with the computed objf
+  /// values for the optimization code to work).  The reason for this is to more
+  /// exactly handle edge effects and to ensure that no frames are
+  /// 'double-counted'.  The order of this vector corresponds to the order of
+  /// the 'indexes' (i.e. all the first frames, then all the second frames,
+  /// etc.)
+  /// If this vector is empty it means we're not applying per-frame weights,
+  /// so it's equivalent to a vector of all ones.  This vector is written
+  /// to disk compactly as unsigned char.
+  Vector<BaseFloat> deriv_weights;
+  
+  // Default constructor.  (The compiler-generated assignment operator is used.)
+  NnetDiscriminativeSupervision() { }
+
+  /// Initialize the object from a discriminative::DiscriminativeSupervision, and some
+  /// extra information.  Note: you probably want to set 'name' to "output".
+  /// 'first_frame' will often be zero but you can choose (just make it
+  /// consistent with how you numbered your inputs), and 'frame_skip' would be 1
+  /// in a vanilla setup, but we plan to try setups where the output periodicity
+  /// is slower than the input, so in this case it might be 2 or 3.
+  NnetDiscriminativeSupervision(const std::string &name,
+                                const discriminative::DiscriminativeSupervision &supervision,
+                                const Vector<BaseFloat> &deriv_weights,
+                                int32 first_frame,
+                                int32 frame_skip);
+
+  NnetDiscriminativeSupervision(const NnetDiscriminativeSupervision &other);
+
+  void Write(std::ostream &os, bool binary) const;
+
+  void Read(std::istream &is, bool binary);
+  
+  void Swap(NnetDiscriminativeSupervision *other);
+
+  void CheckDim() const;  // asserts that 'indexes' and 'deriv_weights' are consistent with 'supervision'.
+  
+  bool operator == (const NnetDiscriminativeSupervision &other) const;  // deriv_weights are compared approximately.
+};
+
+/// NnetDiscriminativeExample is like NnetExample, but specialized for
+/// sequence training.
+struct NnetDiscriminativeExample {
+
+  /// 'inputs' contains the input to the network-- normally it has just one
+  /// element called "input", but there may be others (e.g. one called
+  /// "ivector")...  this depends on the setup.
+  std::vector<NnetIo> inputs;
+
+  /// 'outputs' contains the sequence output supervision.  There will normally
+  /// be just one member with name == "output".
+  std::vector<NnetDiscriminativeSupervision> outputs;
+
+  void Write(std::ostream &os, bool binary) const;  // asserts that there is at least one input and output.
+  void Read(std::istream &is, bool binary);  // raises an error unless both counts are in [1, 1000000].
+
+  void Swap(NnetDiscriminativeExample *other);
+
+  // Compresses the input features (if not compressed)
+  void Compress();
+
+  NnetDiscriminativeExample() { }
+
+  NnetDiscriminativeExample(const NnetDiscriminativeExample &other);
+
+  bool operator == (const NnetDiscriminativeExample &other) const {
+    return inputs == other.inputs && outputs == other.outputs;
+  }
+};
+
+/** 
+  Appends the given vector of examples (which must be non-empty) into 
+  a single output example.
+  Intended to be used when forming minibatches for neural net training. If 
+  'compress' it compresses the output features (recommended to save disk
+  space).
+
+  Note: the input is left as it was at the start, but it is temporarily
+  changed inside the function; this is a trick to allow us to use the
+  MergeExamples() routine while avoiding having to rewrite code.
+*/
+void MergeDiscriminativeExamples(
+    bool compress,
+    std::vector<NnetDiscriminativeExample> *input,
+    NnetDiscriminativeExample *output);
+
+// called from MergeDiscriminativeExamples, this function merges the Supervision
+// objects into one.  Requires (and checks) that they all have the same name.
+
+void MergeSupervision(
+    const std::vector<const NnetDiscriminativeSupervision*> &inputs,
+    NnetDiscriminativeSupervision *output); 
+
+
+/** Shifts the time-index t of everything in the input of "eg" by adding
+    "frame_shift" to all "t" values-- but excluding those with names listed in
+    "exclude_names", e.g.  "ivector".  This might be useful if you are doing
+    subsampling of frames at the output, because shifted examples won't be quite
+    equivalent to their non-shifted counterparts.  "exclude_names" is a vector
+    of names of nnet inputs that we avoid shifting the "t" values of-- normally
+    it will contain just the single string "ivector" because we always leave t=0
+    for any ivector.
+
+    Note: input features will be shifted by 'frame_shift', and indexes in the
+    supervision in (eg->outputs) will be shifted by 'frame_shift' rounded to the
+    closest multiple of the frame subsampling factor (e.g. 3).  The frame
+    subsampling factor is worked out from the time spacing between the indexes
+    in the output.  */
+void ShiftDiscriminativeExampleTimes(int32 frame_shift,
+                                    const std::vector<std::string> &exclude_names,
+                                    NnetDiscriminativeExample *eg);
+
+/**
+   This sets to zero any elements of 'eg->outputs[*].deriv_weights' that correspond
+   to frames within the first or last 'truncate' frames of the sequence (e.g. you could
+   set 'truncate=5' to set zero deriv-weight for the first and last 5 frames of the
+   sequence).
+ */
+void TruncateDerivWeights(int32 truncate,
+                          NnetDiscriminativeExample *eg);
+
+/**  This function takes a NnetDiscriminativeExample and produces a 
+     ComputationRequest.
+     Assumes you don't want the derivatives w.r.t. the inputs; if you do, you
+     can create the ComputationRequest manually.  Assumes that if
+     need_model_derivative is true, you will be supplying derivatives w.r.t. all
+     outputs.
+*/
+void GetDiscriminativeComputationRequest(const Nnet &nnet,
+                                         const NnetDiscriminativeExample &eg,
+                                         bool need_model_derivative,
+                                         bool store_component_stats,
+                                         bool use_xent_regularization,
+                                         bool use_xent_derivative,
+                                         ComputationRequest *computation_request);
+
+/**
+   Given a discriminative training example, this function works out posteriors
+   at the pdf level (note: these are "discriminative-training posteriors" that
+   may be positive or negative).  The denominator lattice "den_lat" in the
+   example "eg" should already have had acoustic-rescoring done so that its
+   acoustic probs are up to date, and any acoustic scaling should already have
+   been applied.
+
+   "criterion" may be one of "mmi", "mpfe", "smbr", "nce", "empfe" and "esmbr".  
+   If criterion is "mmi", "drop_frames" means we don't include derivatives for
+   frames where the numerator pdf is not in the denominator lattice.
+
+   if "one_silence_class" is true you can get a newer behavior for MPE/SMBR
+   which will tend to reduce insertions.
+
+   "silence_phones" is a list of silence phones (this is only relevant for mpfe
+   or smbr, if we want to treat silence specially).
+ */
+void ExampleToPdfPost(
+    const TransitionModel &tmodel,
+    const std::vector<int32> &silence_phones,
+    std::string criterion,
+    bool drop_frames,
+    bool one_silence_class,
+    const NnetDiscriminativeExample &eg,
+    Posterior *post);  // NOTE(review): not defined in nnet-discriminative-example.cc -- confirm a definition exists.
+
+/**
+   This function is used in code that tests the functionality that we provide
+   here, about splitting and excising nnet examples.  It adds to a "hash
+   function" that is a function of a set of examples; the hash function is of
+   dimension (number of pdf-ids x features dimension).  The hash function
+   consists of the (denominator - numerator) posteriors over pdf-ids, times the
+   average over the context-window (left-context on the left, right-context on
+   the right), of the features.  This is useful because the various
+   manipulations we do are supposed to preserve this, and if there is a bug
+   it will most likely cause the hash function to change.
+
+   This function will resize the matrix if it is empty.
+
+   Any acoustic scaling of the lattice should be done before you call this
+   function.
+   
+   "criterion" may be one of "mmi", "mpfe", "smbr", "nce", "empfe" and "esmbr".  
+
+   You should set drop_frames to true if you are doing MMI with drop-frames
+   == true.  Then it will not compute the hash for frames where the numerator
+   pdf-id is not in the denominator lattice.
+
+   You can set one_silence_class to true for a newer optional behavior that will
+   reduce insertions in the trained model (or false for the traditional
+   behavior).
+
+   The function will also accumulate the total numerator and denominator weights
+   used as num_weight and den_weight, for an additional diagnostic, and the total
+   number of frames, as tot_t.
+*/
+void UpdateHash(
+    const TransitionModel &tmodel,
+    const NnetDiscriminativeExample &eg,
+    std::string criterion,
+    bool drop_frames,
+    bool one_silence_class,
+    Matrix<double> *hash,
+    double *num_weight,
+    double *den_weight,
+    double *tot_t);  // NOTE(review): not defined in nnet-discriminative-example.cc -- confirm a definition exists.
+
+
+typedef TableWriter<KaldiObjectHolder<NnetDiscriminativeExample > > NnetDiscriminativeExampleWriter;
+typedef SequentialTableReader<KaldiObjectHolder<NnetDiscriminativeExample > > SequentialNnetDiscriminativeExampleReader;
+typedef RandomAccessTableReader<KaldiObjectHolder<NnetDiscriminativeExample > > RandomAccessNnetDiscriminativeExampleReader;
+
+} // namespace nnet3
+} // namespace kaldi
+
+#endif // KALDI_NNET3_NNET_DISCRIMINATIVE_EXAMPLE_H_
+
diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc
new file mode 100644
index 00000000000..6f8383bb09a
--- /dev/null
+++ b/src/nnet3/nnet-discriminative-training.cc
@@ -0,0 +1,289 @@
+// nnet3/nnet-discriminative-training.cc
+
+// Copyright      2012-2015    Johns Hopkins University (author: Daniel Povey)
+// Copyright      2014-2015    Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "nnet3/nnet-discriminative-training.h"
+#include "nnet3/nnet-utils.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+// Sets up the trainer.  Optionally zeroes the component stats of 'nnet',
+// allocates the delta-parameter network when momentum or max-param-change is
+// in use, and converts the supplied priors to log-priors.
+NnetDiscriminativeTrainer::NnetDiscriminativeTrainer(
+    const NnetDiscriminativeTrainingOptions &opts,
+    const TransitionModel &tmodel,
+    const VectorBase<BaseFloat> &priors,
+    Nnet *nnet):
+    opts_(opts), tmodel_(tmodel), log_priors_(priors),
+    nnet_(nnet),
+    compiler_(*nnet, opts_.nnet_config.optimize_config),
+    num_minibatches_processed_(0) {
+  const NnetTrainerOptions &trainer_opts = opts.nnet_config;
+
+  if (trainer_opts.zero_component_stats)
+    ZeroComponentStats(nnet);
+
+  // The delta network is only needed when at least one of momentum and
+  // max-param-change is nonzero.
+  const bool no_delta_needed = (trainer_opts.momentum == 0.0 &&
+                                trainer_opts.max_param_change == 0.0);
+  if (!no_delta_needed) {
+    KALDI_ASSERT(trainer_opts.momentum >= 0.0 &&
+                 trainer_opts.max_param_change >= 0.0);
+    delta_nnet_ = nnet_->Copy();
+    // 'false' here means "not a gradient"; passing true would disable the
+    // natural-gradient updates.
+    const bool treat_as_gradient = false;
+    SetZero(treat_as_gradient, delta_nnet_);
+  } else {
+    delta_nnet_ = NULL;
+  }
+
+  // log_priors_ was initialized from 'priors'; take the log in place.
+  log_priors_.ApplyLog();
+}
+
+
+// Trains on one minibatch.  Builds a computation request for 'eg', compiles it
+// with compiler_, runs the forward pass, computes objectives and output
+// derivatives in ProcessOutputs(), and runs the backward pass.  If delta_nnet_
+// is in use (momentum or max-param-change nonzero), the accumulated parameter
+// change is then limited to --max-param-change and added to *nnet_, and
+// delta_nnet_ is scaled by the momentum for the next minibatch.
+void NnetDiscriminativeTrainer::Train(const NnetDiscriminativeExample &eg) {
+  bool need_model_derivative = true;
+  const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  bool use_xent_regularization = (opts_.discriminative_training_config.xent_regularize != 0.0);
+  ComputationRequest request;
+  GetDiscriminativeComputationRequest(*nnet_, eg, need_model_derivative,
+                                      nnet_config.store_component_stats,
+                                      use_xent_regularization,
+                                      need_model_derivative,
+                                      &request);
+  const NnetComputation *computation = compiler_.Compile(request);
+
+  // When delta_nnet_ is NULL the update is applied directly to nnet_.
+  NnetComputer computer(nnet_config.compute_config, *computation,
+                        *nnet_,
+                        (delta_nnet_ == NULL ? nnet_ : delta_nnet_));
+  // give the inputs to the computer object.
+  computer.AcceptInputs(*nnet_, eg.inputs);
+  computer.Forward();
+
+  this->ProcessOutputs(eg, &computer);
+  computer.Backward();
+
+  if (delta_nnet_ != NULL) {
+    BaseFloat scale = (1.0 - nnet_config.momentum);
+    if (nnet_config.max_param_change != 0.0) {
+      BaseFloat param_delta =
+          std::sqrt(DotProduct(*delta_nnet_, *delta_nnet_)) * scale;
+      // (x - x) is nonzero only when x is infinite or NaN.  This test has to
+      // come before the comparison with max_param_change: every comparison
+      // with NaN is false, so a NaN parameter change would otherwise slip
+      // through and be added to the model.
+      if (param_delta - param_delta != 0.0) {
+        KALDI_WARN << "Infinite parameter change, will not apply.";
+        SetZero(false, delta_nnet_);
+      } else if (param_delta > nnet_config.max_param_change) {
+        scale *= nnet_config.max_param_change / param_delta;
+        KALDI_LOG << "Parameter change too big: " << param_delta << " > "
+                  << "--max-param-change=" << nnet_config.max_param_change
+                  << ", scaling by "
+                  << nnet_config.max_param_change / param_delta;
+      }
+    }
+    AddNnet(*delta_nnet_, scale, nnet_);
+    ScaleNnet(nnet_config.momentum, delta_nnet_);
+  }
+}
+
+
+// For each supervision object in eg.outputs: checks that the network has an
+// output node of that name, computes the discriminative objective and its
+// derivative w.r.t. the network output, optionally computes the
+// cross-entropy objective on the "<name>-xent" output, applies the per-frame
+// derivative weights if enabled, hands the derivatives to 'computer' and
+// updates the objective-function statistics in objf_info_.
+void NnetDiscriminativeTrainer::ProcessOutputs(const NnetDiscriminativeExample &eg,
+                                               NnetComputer *computer) {
+  // normally the eg will have just one output named 'output', but
+  // we don't assume this.
+  std::vector<NnetDiscriminativeSupervision>::const_iterator iter = eg.outputs.begin(),
+      end = eg.outputs.end();
+  for (; iter != end; ++iter) {
+    const NnetDiscriminativeSupervision &sup = *iter;
+    int32 node_index = nnet_->GetNodeIndex(sup.name);
+    if (node_index < 0 ||
+        !nnet_->IsOutputNode(node_index))
+      KALDI_ERR << "Network has no output named " << sup.name;
+
+    const CuMatrixBase<BaseFloat> &nnet_output = computer->GetOutput(sup.name);
+
+    // Allocated with kUndefined; it is passed as an output argument to
+    // ComputeDiscriminativeObjfAndDeriv() below.
+    CuMatrix<BaseFloat> nnet_output_deriv(nnet_output.NumRows(),
+                                          nnet_output.NumCols(),
+                                          kUndefined);
+    
+    bool use_xent = (opts_.discriminative_training_config.xent_regularize != 0.0);
+    std::string xent_name = sup.name + "-xent";  // typically "output-xent".
+    // xent_deriv is only sized (and only used) when xent regularization is on.
+    CuMatrix<BaseFloat> xent_deriv;
+    if (use_xent)
+      xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
+                               kUndefined);
+
+    discriminative::DiscriminativeTrainingStats stats(opts_.discriminative_training_stats_config);
+    
+    // First time we see this output name: create its entry and give the
+    // overall stats object the stats config.
+    if (objf_info_.count(sup.name) == 0)
+      objf_info_[sup.name].stats.SetConfig(opts_.discriminative_training_stats_config);
+
+    // NOTE(review): tot_l2_term is filled in by the call below but is not
+    // read afterwards in this function.
+    BaseFloat tot_l2_term = 0.0;
+
+    ComputeDiscriminativeObjfAndDeriv(opts_.discriminative_training_config, 
+                                      tmodel_, log_priors_,
+                                      sup.supervision, nnet_output,
+                                      &stats, &tot_l2_term,
+                                      &nnet_output_deriv,
+                                      (use_xent ? &xent_deriv : NULL));
+    
+    if (use_xent) {
+      // this block computes the cross-entropy objective.
+      const CuMatrixBase<BaseFloat> &xent_output = computer->GetOutput(xent_name);
+      // at this point, xent_deriv is posteriors derived from the numerator
+      // computation.  note, xent_objf has a factor of '.supervision.weight'
+      BaseFloat xent_objf = TraceMatMat(xent_output, xent_deriv, kTrans);
+      // (x != x) is true only for NaN; substitute a fixed default so that the
+      // accumulated statistics are not made NaN.
+      if (xent_objf != xent_objf) {
+        BaseFloat default_objf = -10;
+        xent_objf = default_objf;
+      }
+
+      // Uses the minibatch counter before it is incremented further below.
+      objf_info_[xent_name].UpdateStats(xent_name, "xent",
+                                        opts_.nnet_config.print_interval,
+                                        num_minibatches_processed_,
+                                        stats.TotalT(), xent_objf);
+    }
+
+    // Scale each row of the derivative(s) by the per-frame weight, if weights
+    // are present in the example and not disabled via the options.
+    if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
+      CuVector<BaseFloat> cu_deriv_weights(sup.deriv_weights);
+      nnet_output_deriv.MulRowsVec(cu_deriv_weights);
+      if (use_xent)
+        xent_deriv.MulRowsVec(cu_deriv_weights);
+    }
+
+    computer->AcceptOutputDeriv(sup.name, &nnet_output_deriv);
+
+    // NOTE(review): the counter is post-incremented here, inside the loop over
+    // outputs, so it advances once per output rather than once per call.
+    objf_info_[sup.name].UpdateStats(sup.name, opts_.discriminative_training_config.criterion,
+                                     opts_.nnet_config.print_interval,
+                                     num_minibatches_processed_++,
+                                     stats);
+    
+    if (use_xent) {
+      // The xent derivative is scaled by the regularization constant before
+      // being given to the computer.
+      xent_deriv.Scale(opts_.discriminative_training_config.xent_regularize);
+      computer->AcceptOutputDeriv(xent_name, &xent_deriv);
+    }
+  }
+}
+
+
+// Prints the overall statistics for every output we have accumulated stats
+// for.  Returns true if at least one of them reported a nonzero count.
+bool NnetDiscriminativeTrainer::PrintTotalStats() const {
+  // The map type must be spelled exactly as objf_info_ is declared, i.e.
+  // including the StringHasher argument; otherwise the iterator belongs to a
+  // different unordered_map specialization.
+  typedef unordered_map<std::string, DiscriminativeObjectiveFunctionInfo,
+                        StringHasher> ObjfInfoMap;
+  const std::string &criterion = opts_.discriminative_training_config.criterion;
+  bool ans = false;
+  for (ObjfInfoMap::const_iterator iter = objf_info_.begin();
+       iter != objf_info_.end(); ++iter) {
+    const std::string &name = iter->first;
+    const DiscriminativeObjectiveFunctionInfo &info = iter->second;
+    // Always call PrintTotalStats (no short-circuit), so that every output
+    // gets its log line.
+    bool ret = info.PrintTotalStats(name, criterion);
+    ans = ans || ret;
+  }
+
+  return ans;
+}
+
+
+// Accumulates one minibatch given as a scalar weight and objective value.
+// If the minibatch starts a new phase (phase = minibatch_counter /
+// minibatches_per_phase), the stats of the phase that just ended are printed
+// and the per-phase accumulators are reset first.
+void DiscriminativeObjectiveFunctionInfo::UpdateStats(
+    const std::string &output_name,
+    const std::string &criterion,
+    int32 minibatches_per_phase,
+    int32 minibatch_counter,
+    BaseFloat this_minibatch_weight,
+    BaseFloat this_minibatch_tot_objf,
+    BaseFloat this_minibatch_tot_aux_objf) {
+  int32 phase = minibatch_counter / minibatches_per_phase;
+  if (phase != current_phase) {
+    KALDI_ASSERT(phase == current_phase + 1); // or doesn't really make sense.
+    PrintStatsForThisPhase(output_name, criterion, minibatches_per_phase);
+    current_phase = phase;
+    stats_this_phase.Reset();
+    tot_aux_objf_this_phase = 0.0;
+  }
+  // per-phase accumulators.
+  stats_this_phase.tot_t_weighted += this_minibatch_weight;
+  stats_this_phase.tot_objf += this_minibatch_tot_objf;
+  tot_aux_objf_this_phase += this_minibatch_tot_aux_objf;
+
+  // overall accumulators.  Only this minibatch's contribution is added here;
+  // adding stats_this_phase would re-count all earlier minibatches of the
+  // phase on every call.
+  stats.tot_t_weighted += this_minibatch_weight;
+  stats.tot_objf += this_minibatch_tot_objf;
+  tot_aux_objf += this_minibatch_tot_aux_objf;
+}
+
+// Accumulates one minibatch given as a DiscriminativeTrainingStats object.
+// If the minibatch starts a new phase (phase = minibatch_counter /
+// minibatches_per_phase), the stats of the phase that just ended are printed
+// and the per-phase accumulators are reset first.
+void DiscriminativeObjectiveFunctionInfo::UpdateStats(
+    const std::string &output_name,
+    const std::string &criterion,
+    int32 minibatches_per_phase,
+    int32 minibatch_counter,
+    discriminative::DiscriminativeTrainingStats this_minibatch_stats,
+    BaseFloat this_minibatch_tot_aux_objf) {
+  int32 phase = minibatch_counter / minibatches_per_phase;
+  if (phase != current_phase) {
+    KALDI_ASSERT(phase == current_phase + 1); // or doesn't really make sense.
+    PrintStatsForThisPhase(output_name, criterion, minibatches_per_phase);
+    current_phase = phase;
+    stats_this_phase.Reset();
+    tot_aux_objf_this_phase = 0.0;
+  }
+  // per-phase accumulators.
+  stats_this_phase.Add(this_minibatch_stats);
+  tot_aux_objf_this_phase += this_minibatch_tot_aux_objf;
+
+  // overall accumulators.  Only this minibatch's stats are added here; adding
+  // stats_this_phase would re-count all earlier minibatches of the phase on
+  // every call.
+  stats.Add(this_minibatch_stats);
+  tot_aux_objf += this_minibatch_tot_aux_objf;
+}
+// Logs the average per-frame objective for the current phase.  When an
+// auxiliary objective was accumulated, the main term, the auxiliary term and
+// their sum are all printed.
+void DiscriminativeObjectiveFunctionInfo::PrintStatsForThisPhase(
+    const std::string &output_name,
+    const std::string &criterion,
+    int32 minibatches_per_phase) const {
+  // Inclusive range of minibatch indexes covered by the current phase.
+  int32 start_minibatch = current_phase * minibatches_per_phase,
+      end_minibatch = start_minibatch + minibatches_per_phase - 1;
+
+  if (tot_aux_objf_this_phase == 0.0) {
+    KALDI_LOG << "Average objective function for '" << output_name
+              << "' for minibatches " << start_minibatch
+              << '-' << end_minibatch << " is "
+              << (stats_this_phase.TotalObjf(criterion) / stats_this_phase.TotalT()) << " over "
+              << stats_this_phase.TotalT() << " frames.";
+  } else {
+    BaseFloat objf = (stats_this_phase.TotalObjf(criterion) / stats_this_phase.TotalT()),
+        aux_objf = (tot_aux_objf_this_phase / stats_this_phase.TotalT()),
+        sum_objf = objf + aux_objf;
+    // The message reads "a + b = <sum>", so the sum must be printed after
+    // the " = ".
+    KALDI_LOG << "Average objective function for '" << output_name
+              << "' for minibatches " << start_minibatch
+              << '-' << end_minibatch << " is "
+              << objf << " + " << aux_objf << " = " << sum_objf
+              << " over " << stats_this_phase.TotalT() << " frames.";
+  }
+}
+
+// Logs the overall average per-frame objective for output 'name', followed by
+// a machine-readable "<criterion>-per-frame=<value>" line.  Returns true if
+// the total frame count was nonzero.
+bool DiscriminativeObjectiveFunctionInfo::PrintTotalStats(const std::string &name,
+                const std::string &criterion) const {
+  BaseFloat objf = stats.TotalObjf(criterion) /stats.TotalT(),
+        aux_objf = (tot_aux_objf / stats.TotalT());
+  if (tot_aux_objf == 0.0) {
+    KALDI_LOG << "Overall average objective function for '" << name << "' is "
+              << objf << " over " << stats.TotalT() << " frames.";
+  } else {
+    // The message reads "a + b = <sum>", so the sum must be printed after
+    // the " = ".
+    BaseFloat sum_objf = objf + aux_objf;
+    KALDI_LOG << "Overall average objective function for '" << name << "' is "
+              << objf << " + " << aux_objf << " = " << sum_objf
+              << " over " << stats.TotalT() << " frames.";
+  }
+  KALDI_LOG << "[this line is to be parsed by a script:] "
+            << criterion << "-per-frame="
+            << objf;
+  return (stats.TotalT() != 0.0);
+}
+
+
+// Frees delta_nnet_, which the constructor either allocated with Copy() or
+// set to NULL (deleting NULL is a no-op).  nnet_ is not deleted here.
+NnetDiscriminativeTrainer::~NnetDiscriminativeTrainer() {
+  delete delta_nnet_;
+}
+
+
+} // namespace nnet3
+} // namespace kaldi
+
diff --git a/src/nnet3/nnet-discriminative-training.h b/src/nnet3/nnet-discriminative-training.h
new file mode 100644
index 00000000000..55caf90317a
--- /dev/null
+++ b/src/nnet3/nnet-discriminative-training.h
@@ -0,0 +1,146 @@
+// nnet3/nnet-discriminative-training.h
+
+// Copyright 2012-2015   Johns Hopkins University (author: Daniel Povey)
+//           2014-2015   Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_NNET3_NNET_DISCRIMINATIVE_TRAINING_H_
+#define KALDI_NNET3_NNET_DISCRIMINATIVE_TRAINING_H_
+
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-computation.h"
+#include "nnet3/nnet-compute.h"
+#include "nnet3/nnet-optimize.h"
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet3/nnet-training.h"
+#include "nnet3/discriminative-training.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+// Options for NnetDiscriminativeTrainer.  Bundles the generic nnet3 trainer
+// options with the discriminative-training options and its stats options,
+// and adds one option of its own (apply_deriv_weights).
+struct NnetDiscriminativeTrainingOptions {
+  NnetTrainerOptions nnet_config;
+  discriminative::DiscriminativeTrainingOptions discriminative_training_config;
+  discriminative::DiscriminativeTrainingStatsOptions discriminative_training_stats_config;
+
+  // If true, the per-frame derivative weights stored with each example are
+  // applied to the output derivatives.
+  bool apply_deriv_weights;
+
+  NnetDiscriminativeTrainingOptions(): apply_deriv_weights(true) { }
+
+  // Registers the options of all three nested configs, plus
+  // --apply-deriv-weights, with 'opts'.
+  void Register(OptionsItf *opts) {
+    nnet_config.Register(opts);
+    discriminative_training_config.Register(opts);
+    discriminative_training_stats_config.Register(opts);
+    opts->Register("apply-deriv-weights", &apply_deriv_weights,
+                   "If true, apply the per-frame derivative weights stored with "
+                   "the example (you'll normally want to leave this as true.");
+  }
+};
+
+// This struct is used in multiple nnet training classes for keeping
+// track of objective function values.
+// Also see struct AccuracyInfo, in nnet-diagnostics.h.
+// It keeps two sets of accumulators: one for the current "phase" (a block of
+// minibatches_per_phase minibatches, used only to control how often progress
+// is logged) and one intended to hold the totals over all minibatches.
+struct DiscriminativeObjectiveFunctionInfo {
+  // Index of the phase currently being accumulated; starts at 0.
+  int32 current_phase;
+
+  // Accumulators meant to cover all minibatches seen so far: the total
+  // auxiliary objective and the discriminative training stats.
+  double tot_aux_objf;
+  discriminative::DiscriminativeTrainingStats stats;
+  
+  // The same quantities for the current phase only; reset by UpdateStats()
+  // whenever the phase changes.
+  double tot_aux_objf_this_phase;
+  discriminative::DiscriminativeTrainingStats stats_this_phase;
+
+  DiscriminativeObjectiveFunctionInfo():
+      current_phase(0), tot_aux_objf(0.0),
+      tot_aux_objf_this_phase(0.0) { }
+
+  // This function updates the stats and, if the phase has just changed,
+  // prints a message indicating progress.  The phase equals
+  // minibatch_counter / minibatches_per_phase.  Its only function is to
+  // control how frequently we print logging messages.
+  // This overload takes the minibatch's weight and total objective as plain
+  // numbers.
+  void UpdateStats(const std::string &output_name,
+                   const std::string &criterion,
+                   int32 minibatches_per_phase,
+                   int32 minibatch_counter,
+                   BaseFloat this_minibatch_weight,
+                   BaseFloat this_minibatch_tot_objf,
+                   BaseFloat this_minibatch_tot_aux_objf = 0.0);
+  
+  // As above, but takes the minibatch's statistics as a
+  // DiscriminativeTrainingStats object.
+  void UpdateStats(const std::string &output_name,
+                   const std::string &criterion,
+                   int32 minibatches_per_phase,
+                   int32 minibatch_counter,
+                   discriminative::DiscriminativeTrainingStats stats,
+                   BaseFloat this_minibatch_tot_aux_objf = 0.0);
+
+  // Prints stats for the current phase.
+  void PrintStatsForThisPhase(const std::string &output_name,
+                              const std::string &criterion,
+                              int32 minibatches_per_phase) const;
+  // Prints total stats, and returns true if total stats' weight was nonzero.
+  bool PrintTotalStats(const std::string &output_name,
+                       const std::string &criterion) const;
+};
+
+
+/**
+   This class is for single-threaded discriminative training of neural nets.
+   It holds a pointer to the Nnet being trained and updates it in place, one
+   minibatch per call to Train().
+*/
+class NnetDiscriminativeTrainer {
+ public:
+  // 'priors' are given as probabilities; the constructor stores their log.
+  // 'nnet' is modified by training.
+  NnetDiscriminativeTrainer(const NnetDiscriminativeTrainingOptions &config,
+                            const TransitionModel &tmodel,
+                            const VectorBase<BaseFloat> &priors,
+                            Nnet *nnet);
+
+  // train on one minibatch.
+  void Train(const NnetDiscriminativeExample &eg);
+
+  // Prints out the final stats, and return true if there was a nonzero count.
+  bool PrintTotalStats() const;
+
+  ~NnetDiscriminativeTrainer();
+ private:
+  // Computes objectives and output derivatives for all outputs of 'eg' and
+  // supplies the derivatives to 'computer'; called between the forward and
+  // backward passes.
+  void ProcessOutputs(const NnetDiscriminativeExample &eg,
+                      NnetComputer *computer);
+
+  const NnetDiscriminativeTrainingOptions opts_;
+
+  const TransitionModel &tmodel_;
+  CuVector<BaseFloat> log_priors_;
+  
+  Nnet *nnet_;
+
+  Nnet *delta_nnet_;  // NULL if both momentum and max_param_change are 0.0;
+                      // otherwise allocated by the constructor and deleted by
+                      // the destructor.  nnet representing
+                      // accumulated parameter-change (we'd call this
+                      // gradient_nnet_, but due to natural-gradient update,
+                      // it's better to consider it as a delta-parameter nnet.
+  CachingOptimizingCompiler compiler_;
+
+  // Counter passed to UpdateStats() to work out the logging phase.
+  int32 num_minibatches_processed_;
+
+  // This code supports multiple output layers, even though in the
+  // normal case there will be just one output layer named "output".
+  // So we store the objective functions per output layer.
+  unordered_map<std::string, DiscriminativeObjectiveFunctionInfo, StringHasher> objf_info_;
+};
+
+
+} // namespace nnet3
+} // namespace kaldi
+
+#endif // KALDI_NNET3_NNET_DISCRIMINATIVE_TRAINING_H_
+
diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc
index 99d41fb06c4..1f4d60f3960 100644
--- a/src/nnet3/nnet-example-utils.cc
+++ b/src/nnet3/nnet-example-utils.cc
@@ -219,5 +219,191 @@ void GetComputationRequest(const Nnet &nnet,
     KALDI_ERR << "No outputs in computation request.";
 }
 
+// Writes 'vec', whose elements must all lie in [0, 1].  In binary mode each
+// element is quantized to one unsigned char (0..255); in text mode the
+// ordinary floating-point vector format is used, since it is more readable.
+void WriteVectorAsChar(std::ostream &os,
+                       bool binary,
+                       const VectorBase<BaseFloat> &vec) {
+  if (!binary) {
+    vec.Write(os, binary);
+    return;
+  }
+  const int32 num_elements = vec.Dim();
+  const BaseFloat *src = vec.Data();
+  std::vector<unsigned char> quantized(num_elements);
+  for (int32 idx = 0; idx < num_elements; idx++) {
+    const BaseFloat v = src[idx];
+    KALDI_ASSERT(v >= 0.0 && v <= 1.0);
+    // static_cast truncates, so adding 0.5 first gives round-to-nearest.
+    quantized[idx] = static_cast<unsigned char>(255.0 * v + 0.5);
+  }
+  WriteIntegerVector(os, binary, quantized);
+}
+
+// Reads a vector written by WriteVectorAsChar.  In binary mode the data is a
+// vector of unsigned chars, which are mapped back to floats by dividing by
+// 255; in text mode it is an ordinary vector.
+void ReadVectorAsChar(std::istream &is,
+                      bool binary,
+                      Vector<BaseFloat> *vec) {
+  if (!binary) {
+    vec->Read(is, binary);
+    return;
+  }
+  std::vector<unsigned char> quantized;
+  ReadIntegerVector(is, binary, &quantized);
+  const int32 num_elements = quantized.size();
+  const BaseFloat inv_levels = 1.0 / 255.0;
+  // every element is overwritten below, so no need to zero the vector.
+  vec->Resize(num_elements, kUndefined);
+  BaseFloat *dest = vec->Data();
+  for (int32 idx = 0; idx < num_elements; idx++)
+    dest[idx] = inv_levels * quantized[idx];
+}
+
+// Rounds *num_frames and *num_frames_overlap up to the nearest multiple of
+// frame_subsampling_factor (logging whenever a value is changed), and then
+// dies with an error unless 0 <= *num_frames_overlap < *num_frames.
+void RoundUpNumFrames(int32 frame_subsampling_factor,
+                      int32 *num_frames,
+                      int32 *num_frames_overlap) {
+  // a non-positive factor would make the '%' and '/' below meaningless
+  // (and a zero factor would be a division by zero).
+  KALDI_ASSERT(frame_subsampling_factor > 0);
+  if (*num_frames % frame_subsampling_factor != 0) {
+    int32 new_num_frames = frame_subsampling_factor *
+        (*num_frames / frame_subsampling_factor + 1);
+    KALDI_LOG << "Rounding up --num-frames=" << (*num_frames)
+              << " to a multiple of --frame-subsampling-factor="
+              << frame_subsampling_factor
+              << ", now --num-frames=" << new_num_frames;
+    *num_frames = new_num_frames;
+  }
+  if (*num_frames_overlap % frame_subsampling_factor != 0) {
+    int32 new_num_frames_overlap = frame_subsampling_factor *
+        (*num_frames_overlap / frame_subsampling_factor + 1);
+    KALDI_LOG << "Rounding up --num-frames-overlap=" << (*num_frames_overlap)
+              << " to a multiple of --frame-subsampling-factor="
+              << frame_subsampling_factor
+              << ", now --num-frames-overlap=" << new_num_frames_overlap;
+    *num_frames_overlap = new_num_frames_overlap;
+  }
+  if (*num_frames_overlap < 0 || *num_frames_overlap >= *num_frames) {
+    // State the requirement that was violated, not the relation that would
+    // have been acceptable.
+    KALDI_ERR << "--num-frames-overlap=" << (*num_frames_overlap)
+              << " must be non-negative and less than "
+              << "--num-frames=" << (*num_frames);
+  }
+
+}
+
+// Pseudo-randomly splits frames 0 ... num_frames - 1 into ranges of length
+// exactly 'frames_per_range', writing the start of each range to
+// *range_starts.  If frames_per_range does not divide num_frames, the
+// leftover frames are either skipped (small gaps) or an extra range is added
+// and neighbouring ranges overlap.  If frames_per_range > num_frames,
+// *range_starts is left empty.
+void SplitIntoRanges(int32 num_frames,
+                     int32 frames_per_range,
+                     std::vector<int32> *range_starts) {
+  // guards the division and modulo below.
+  KALDI_ASSERT(frames_per_range > 0);
+  if (frames_per_range > num_frames) {
+    range_starts->clear();
+    return;  // there is no room for even one range.
+  }
+  int32 num_ranges = num_frames / frames_per_range,
+      extra_frames = num_frames % frames_per_range;
+  // this is a kind of heuristic.  If the number of frames we'd
+  // be skipping is less than 1/4 of the frames_per_range, then
+  // skip frames; otherwise, duplicate frames.
+  // it's important that this is <=, not <, so that if
+  // extra_frames == 0 and frames_per_range is < 4, we
+  // don't insert an extra range.
+  if (extra_frames <= frames_per_range / 4) {
+    // skip frames.  we do this at start or end, or between ranges.
+    std::vector<int32> num_skips(num_ranges + 1, 0);
+    for (int32 i = 0; i < extra_frames; i++)
+      num_skips[RandInt(0, num_ranges)]++;
+    range_starts->resize(num_ranges);
+    int32 cur_start = num_skips[0];
+    for (int32 i = 0; i < num_ranges; i++) {
+      (*range_starts)[i] = cur_start;
+      cur_start += frames_per_range;
+      cur_start += num_skips[i + 1];
+    }
+    KALDI_ASSERT(cur_start == num_frames);
+  } else {
+    // duplicate frames.
+    num_ranges++;
+    int32 num_duplicated_frames = frames_per_range - extra_frames;
+    // the way we handle the 'num_duplicated_frames' frames that are output
+    // twice is that we backtrack zero or more frames between outputting each
+    // pair of ranges, and the total of these backtracks equals
+    // 'num_duplicated_frames'.
+    std::vector<int32> num_backtracks(num_ranges, 0);
+    for (int32 i = 0; i < num_duplicated_frames; i++) {
+      // num_ranges - 2 below is not a bug.  we only want to backtrack
+      // between ranges, not past the end of the last range (i.e. at
+      // position num_ranges - 1).  we make the vector one longer to
+      // simplify the loop below.
+      num_backtracks[RandInt(0, num_ranges - 2)]++;
+    }
+    range_starts->resize(num_ranges);
+    int32 cur_start = 0;
+    for (int32 i = 0; i < num_ranges; i++) {
+      (*range_starts)[i] = cur_start;
+      cur_start += frames_per_range;
+      cur_start -= num_backtracks[i];
+    }
+    KALDI_ASSERT(cur_start == num_frames);
+  }
+}
+
+// Computes per-frame weights for ranges of length 'range_length' starting at
+// the positions in 'range_starts'.  All weights start at 1.0; where two
+// successive ranges overlap, the weights in the overlap are reduced so that
+// the two ranges' weights for a shared frame sum to one.
+void GetWeightsForRanges(int32 range_length,
+                         const std::vector<int32> &range_starts,
+                         std::vector<Vector<BaseFloat> > *weights) {
+  KALDI_ASSERT(range_length > 0);
+  int32 num_ranges = range_starts.size();
+  weights->resize(num_ranges);
+  for (int32 i = 0; i < num_ranges; i++) {
+    (*weights)[i].Resize(range_length);
+    (*weights)[i].Set(1.0);
+  }
+  // Consider each pair (i, j = i + 1) of successive ranges; range starts
+  // are required to be strictly increasing.
+  for (int32 i = 0; i + 1 < num_ranges; i++) {
+    int32 j = i + 1;
+    int32 i_start = range_starts[i], i_end = i_start + range_length,
+          j_start = range_starts[j];
+    KALDI_ASSERT(j_start > i_start);
+    if (i_end > j_start) {
+      Vector<BaseFloat> &i_weights = (*weights)[i], &j_weights = (*weights)[j];
+
+      int32 overlap_length = i_end - j_start;
+      // divide the overlapping piece of the 2 ranges into 3 regions of
+      // approximately equal size, called the left, middle and right region.
+      int32 left_length = overlap_length / 3,
+          middle_length = (overlap_length - left_length) / 2,
+           right_length = overlap_length - left_length - middle_length;
+      KALDI_ASSERT(left_length >= 0 && middle_length >= 0 && right_length >= 0 &&
+                   left_length + middle_length + right_length == overlap_length);
+      // set the weight of the left region to be zero for the right (j) range.
+      for (int32 k = 0; k < left_length; k++)
+        j_weights(k) = 0.0;
+      // set the weight of the right region to be zero for the left (i) range.
+      for (int32 k = 0; k < right_length; k++)
+        i_weights(range_length - 1 - k) = 0.0;
+      // for the middle range, linearly interpolate between the 0's and 1's.
+      // j's weights rise from left to right and i's weights fall, so the
+      // two weights of any shared frame sum to one.
+      // NOTE(review): the weights are assigned here, not multiplied into the
+      // existing values, so in the unexpected case where three ranges
+      // overlap, a weight set while processing the previous pair can be
+      // overwritten.
+      for (int32 k = 0; k < middle_length; k++) {
+        BaseFloat weight = (0.5 + k) / middle_length;
+        j_weights(left_length + k) = weight;
+        i_weights(range_length - 1 - right_length - k) = weight;
+      }
+    }
+  }
+}
+
+// Sets every range's weights to 1.0 and then zeroes 'num_frames_zeroed'
+// frames at each boundary between ranges: the first frames of every range
+// but the first, and the last frames of every range but the last.
+void GetWeightsForRangesNew(int32 range_length,
+                            int32 num_frames_zeroed,
+                            const std::vector<int32> &range_starts,
+                            std::vector<Vector<BaseFloat> > *weights) {
+  KALDI_ASSERT(range_length > 0 && num_frames_zeroed * 2 < range_length);
+  const int32 num_ranges = range_starts.size();
+  weights->resize(num_ranges);
+  for (int32 r = 0; r < num_ranges; r++) {
+    Vector<BaseFloat> &w = (*weights)[r];
+    w.Resize(range_length);
+    w.Set(1.0);
+  }
+  if (num_frames_zeroed != 0) {
+    // leading frames of all ranges except the first.
+    for (int32 r = 1; r < num_ranges; r++) {
+      Vector<BaseFloat> &w = (*weights)[r];
+      w.Range(0, num_frames_zeroed).Set(0.0);
+    }
+    // trailing frames of all ranges except the last.
+    const int32 tail_start = range_length - num_frames_zeroed;
+    for (int32 r = 0; r + 1 < num_ranges; r++) {
+      Vector<BaseFloat> &w = (*weights)[r];
+      w.Range(tail_start, num_frames_zeroed).Set(0.0);
+    }
+  }
+}
+
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h
index d54c3296dac..1675788c2d4 100644
--- a/src/nnet3/nnet-example-utils.h
+++ b/src/nnet3/nnet-example-utils.h
@@ -63,6 +63,63 @@ void GetComputationRequest(const Nnet &nnet,
                            ComputationRequest *computation_request);
 
 
+// writes compressed as unsigned char a vector 'vec' that is required to have
+// values between 0 and 1.  The compression (one byte per element, rounded to
+// the nearest of 256 levels) is only applied in binary mode; in text mode the
+// vector is written in the regular floating-point format.
+void WriteVectorAsChar(std::ostream &os,
+                       bool binary,
+                       const VectorBase<BaseFloat> &vec);
+
+// reads data written by WriteVectorAsChar.  'vec' is resized to the dimension
+// of the data read; 'binary' must match the mode the data was written in.
+void ReadVectorAsChar(std::istream &is,
+                             bool binary,
+                             Vector<BaseFloat> *vec);
+
+// Rounds '*num_frames' and '*num_frames_overlap' up to multiples of
+// 'frame_subsampling_factor', logging a message for each value it changes.
+// Raises an error if, after rounding, *num_frames_overlap is negative or is
+// not less than *num_frames.
+void RoundUpNumFrames(int32 frame_subsampling_factor,
+                      int32 *num_frames,
+                      int32 *num_frames_overlap);
+
+/// This function helps you to pseudo-randomly split a sequence of length 'num_frames',
+/// interpreted as frames 0 ... num_frames - 1, into pieces of length exactly
+/// 'frames_per_range', to be used as examples for training.  Because frames_per_range
+/// may not exactly divide 'num_frames', this function will leave either small gaps or
+/// small overlaps in pseudo-random places.
+/// The output 'range_starts' will be set to a list of the starts of ranges, the
+/// output ranges are of the form
+/// [ (*range_starts)[i] ... (*range_starts)[i] + frames_per_range - 1 ].
+/// If frames_per_range > num_frames there is no room for even one range, and
+/// 'range_starts' is set to the empty list.
+void SplitIntoRanges(int32 num_frames,
+                     int32 frames_per_range,
+                     std::vector<int32> *range_starts);
+
+
+/// This utility function is not used directly in the 'chain' code.  It is used
+/// to get weights for the derivatives, so that we don't doubly train on some
+/// frames after splitting them up into overlapping ranges of frames.  The input
+/// 'range_starts' will be obtained from 'SplitIntoRanges', but the
+/// 'range_length', which is a length in frames, may be longer than the one
+/// supplied to SplitIntoRanges, due to the 'overlap'.  (see the calling code...
+/// if we want overlapping ranges, we get it by 'faking' the input to
+/// SplitIntoRanges).
+///
+/// The output vector 'weights' will be given the same dimension as
+/// 'range_starts'.  By default the output weights in '*weights' will be vectors
+/// of all ones, of length equal to 'range_length', and '(*weights)[i]' represents
+/// the weights given to frames numbered
+///   t = range_starts[i] ... range_starts[i] + range_length - 1.
+/// If these ranges for two successive 'i' values overlap, then we
+/// reduce the weights to ensure that no 't' value gets a total weight
+/// greater than 1.  We do this by dividing the overlapped region
+/// into three approximately equal parts, and giving the left part
+/// to the left range; the right part to the right range; and
+/// in between, interpolating linearly.
+/// 'range_length' must be positive and 'range_starts' strictly increasing.
+void GetWeightsForRanges(int32 range_length,
+                         const std::vector<int32> &range_starts,
+                         std::vector<Vector<BaseFloat> > *weights);
+
+/// Simpler alternative to GetWeightsForRanges.  The output 'weights' has the
+/// same dimension as 'range_starts', and each element is a vector of length
+/// 'range_length' that is all ones, except that 'num_frames_zeroed' frames
+/// are set to zero at the start of every range but the first and at the end
+/// of every range but the last.  Requires range_length > 0 and
+/// 2 * num_frames_zeroed < range_length.
+void GetWeightsForRangesNew(int32 range_length,
+                            int32 num_frames_zeroed,
+                            const std::vector<int32> &range_starts,
+                            std::vector<Vector<BaseFloat> > *weights);
+
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc
index e6c6baf3e1e..8dea02b8918 100644
--- a/src/nnet3/nnet-nnet.cc
+++ b/src/nnet3/nnet-nnet.cc
@@ -1,7 +1,7 @@
 // nnet3/nnet-nnet.cc
 
 // Copyright      2015  Johns Hopkins University (author: Daniel Povey)
-
+//                2016  Daniel Galvez
 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -139,6 +139,12 @@ Component *Nnet::GetComponent(int32 c) {
   return components_[c];
 }
 
+// Replaces the component with index c by 'component', deleting the component
+// previously stored there.  The pointer itself is stored; no copy is made.
+void Nnet::SetComponent(int32 c, Component *component) {
+  // the cast makes a negative c fail the check as well.
+  KALDI_ASSERT(static_cast<size_t>(c) < components_.size());
+  // Setting a component to itself must be a no-op; otherwise we would delete
+  // it and then store the dangling pointer.
+  if (components_[c] == component)
+    return;
+  delete components_[c];
+  components_[c] = component;
+}
+
 /// Returns true if this is component-input node, i.e. a node of type kDescriptor
 /// that immediately precedes a node of type kComponent.
 bool Nnet::IsComponentInputNode(int32 node) const {
diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h
index 89c0d4810dd..a48fbb26f88 100644
--- a/src/nnet3/nnet-nnet.h
+++ b/src/nnet3/nnet-nnet.h
@@ -1,7 +1,7 @@
 // nnet3/nnet-nnet.h
 
 // Copyright   2012-2015  Johns Hopkins University (author: Daniel Povey)
-
+//             2016  Daniel Galvez
 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -130,6 +130,9 @@ class Nnet {
   /// caller.
   const Component *GetComponent(int32 c) const;
 
+  /// Replace the component indexed by c with a new component.
+  /// Frees previous component indexed by c.  The pointer 'component' is
+  /// stored as-is (no copy is made), so the caller must not delete it
+  /// afterwards.  c must be a valid component index.
+  void SetComponent(int32 c, Component *component);
 
   /// returns const reference to a particular numbered network node.
   const NetworkNode &GetNode(int32 node) const {
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index c66d0235a93..86f4739e30f 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -1369,6 +1369,25 @@ BlockAffineComponent::BlockAffineComponent(const BlockAffineComponent &other) :
   bias_params_(other.bias_params_),
   num_blocks_(other.num_blocks_) {}
 
+BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac) :
+  UpdatableComponent(rac),
+  linear_params_(rac.num_repeats_ * rac.linear_params_.NumRows(),
+                 rac.linear_params_.NumCols(), kUndefined),
+  bias_params_(rac.num_repeats_ * rac.linear_params_.NumRows(), kUndefined),
+  num_blocks_(rac.num_repeats_) {
+  // Every block receives its own copy of rac's shared weights and biases.
+  const int32 rows_per_block = rac.linear_params_.NumRows();
+  for (int32 b = 0; b < num_blocks_; b++) {
+    const int32 first_row = b * rows_per_block;
+    CuSubMatrix<BaseFloat> weights_dst =
+        linear_params_.RowRange(first_row, rows_per_block);
+    weights_dst.CopyFromMat(rac.linear_params_);
+    CuSubVector<BaseFloat> bias_dst =
+        bias_params_.Range(first_row, rows_per_block);
+    bias_dst.CopyFromVec(rac.bias_params_);
+  }
+}
+
 Component* BlockAffineComponent::Copy() const {
   BlockAffineComponent *ans = new BlockAffineComponent(*this);
   return ans;
@@ -4826,6 +4845,15 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) {
                 << "(or undefined or bad component type [type=xxx]), in "
                 << "CompositeComponent config line '" << cfl->WholeLine() << "'";
     }
+    if(this_component->Type() == "CompositeComponent") {
+      DeletePointers(&components);  // release what was built before erroring
+      delete this_component;
+      KALDI_ERR << "Found CompositeComponent nested within CompositeComponent."
+                << " Try decreasing max-rows-process instead."
+                << " Nested line: '" << nested_line.WholeLine() << "'\n"
+                << "Toplevel CompositeComponent line '" << cfl->WholeLine()
+                << "'";
+    }
     this_component->InitFromConfig(&nested_line);
     components.push_back(this_component);
   }
@@ -4835,7 +4863,16 @@ void CompositeComponent::InitFromConfig(ConfigLine *cfl) {
   this->Init(components, max_rows_process);
 }
 
+const Component* CompositeComponent::GetComponent(int32 i) const {
+  KALDI_ASSERT(static_cast<size_t>(i) < components_.size());
+  return components_[i];
+}
 
+void CompositeComponent::SetComponent(int32 i, Component *component) {
+  KALDI_ASSERT(static_cast<size_t>(i) < components_.size());
+  if (components_[i] != component) delete components_[i];  // unless same ptr
+  components_[i] = component;
+}
 
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index a78f72c0afb..388f2666740 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -389,6 +389,79 @@ class AffineComponent: public UpdatableComponent {
   CuVector<BaseFloat> bias_params_;
 };
 
+class RepeatedAffineComponent;
+
+/// This class implements an affine transform using a block diagonal matrix,
+/// i.e., one whose weight matrix is all zeros except for blocks on the
+/// diagonal. All these blocks have the same dimensions.
+///  input-dim: num cols of block diagonal matrix.
+///  output-dim: num rows of block diagonal matrix.
+/// num-blocks: number of blocks in diagonal of the matrix.
+/// num-blocks must divide both input-dim and output-dim
+class BlockAffineComponent : public UpdatableComponent {
+ public:
+  virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
+  virtual int32 OutputDim() const { return linear_params_.NumRows(); }
+
+  virtual std::string Info() const;
+  virtual void InitFromConfig(ConfigLine *cfl);
+
+  BlockAffineComponent() { }
+  virtual std::string Type() const { return "BlockAffineComponent"; }
+  virtual int32 Properties() const {
+    return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
+      kBackpropNeedsInput|kBackpropAdds;
+  }
+
+  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
+                         const CuMatrixBase<BaseFloat> &in,
+                         CuMatrixBase<BaseFloat> *out) const;
+
+  virtual void Backprop(const std::string &debug_info,
+                        const ComponentPrecomputedIndexes *indexes,
+                        const CuMatrixBase<BaseFloat> &in_value,
+                        const CuMatrixBase<BaseFloat> &, // out_value
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        Component *to_update,
+                        CuMatrixBase<BaseFloat> *in_deriv) const;
+
+  virtual void Read(std::istream &is, bool binary);
+  virtual void Write(std::ostream &os, bool binary) const;
+
+  virtual Component* Copy() const;
+
+  // Functions from base-class UpdatableComponent.
+  virtual void Scale(BaseFloat scale);
+  virtual void Add(BaseFloat alpha, const Component &other);
+  virtual void SetZero(bool treat_as_gradient);
+  virtual void PerturbParams(BaseFloat stddev);
+  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
+  virtual int32 NumParameters() const;
+  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
+  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
+
+  // BlockAffine-specific functions.
+  void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
+            BaseFloat param_stddev, BaseFloat bias_mean,
+            BaseFloat bias_stddev);
+  explicit BlockAffineComponent(const BlockAffineComponent &other);
+  explicit BlockAffineComponent(const RepeatedAffineComponent &rac);
+ protected:
+  // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
+  // equal size.  The blocks are stored in linear_params_ as
+  // [ M
+  //   N
+  //   O ] but we actually treat it as the matrix:
+  // [ M 0 0
+  //   0 N 0
+  //   0 0 O ]
+  CuMatrix<BaseFloat> linear_params_;
+  CuVector<BaseFloat> bias_params_;
+  int32 num_blocks_;
+ private:
+  const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
+};
+
 class RepeatedAffineComponent: public UpdatableComponent {
  public:
 
@@ -438,7 +511,7 @@ class RepeatedAffineComponent: public UpdatableComponent {
   void Init(int32 input_dim, int32 output_dim, int32 num_repeats,
             BaseFloat param_stddev, BaseFloat bias_mean,
             BaseFloat bias_stddev);
-
+  friend BlockAffineComponent::BlockAffineComponent(const RepeatedAffineComponent &rac);
  protected:
   // This function Update(), called from backprop, is broken out for
   // extensibility to natural gradient update.
@@ -492,77 +565,6 @@ class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent {
   OnlineNaturalGradient preconditioner_in_;
 };
 
-
-/// This class implements an affine transform using a block diagonal matrix
-/// e.g., one whose weight matrix is all zeros except for blocks on the
-/// diagonal. All these blocks have the same dimensions.
-///  input-dim: num cols of block diagonal matrix.
-///  output-dim: num rows of block diagonal matrix.
-/// num-blocks: number of blocks in diagonal of the matrix.
-/// num-blocks must divide both input-dim and output-dim
-class BlockAffineComponent : public UpdatableComponent {
- public:
-  virtual int32 InputDim() const { return linear_params_.NumCols() * num_blocks_; }
-  virtual int32 OutputDim() const { return linear_params_.NumRows(); }
-
-  virtual std::string Info() const;
-  virtual void InitFromConfig(ConfigLine *cfl);
-
-  BlockAffineComponent() { }
-  virtual std::string Type() const { return "BlockAffineComponent"; }
-  virtual int32 Properties() const {
-    return kSimpleComponent|kUpdatableComponent|kLinearInParameters|
-      kBackpropNeedsInput|kBackpropAdds;
-  }
-
-  virtual void Propagate(const ComponentPrecomputedIndexes *indexes,
-                         const CuMatrixBase<BaseFloat> &in,
-                         CuMatrixBase<BaseFloat> *out) const;
-
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &, // out_value
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        Component *to_update,
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-
-  virtual Component* Copy() const;
-
-  // Functions from base-class UpdatableComponent.
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const Component &other);
-  virtual void SetZero(bool treat_as_gradient);
-  virtual void PerturbParams(BaseFloat stddev);
-  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
-  virtual int32 NumParameters() const;
-  virtual void Vectorize(VectorBase<BaseFloat> *params) const;
-  virtual void UnVectorize(const VectorBase<BaseFloat> &params);
-
-  // BlockAffine-specific functions.
-  void Init(int32 input_dim, int32 output_dim, int32 num_blocks,
-            BaseFloat param_stddev, BaseFloat bias_mean,
-            BaseFloat bias_stddev);
-  explicit BlockAffineComponent(const BlockAffineComponent &other);
- protected:
-  // The matrix linear_params_ has a block structure, with num_blocks_ blocks of
-  // equal size.  The blocks are stored in linear_params_ as
-  // [ M
-  //   N
-  //   O ] but we actually treat it as the matrix:
-  // [ M 0 0
-  //   0 N 0
-  //   0 0 O ]
-  CuMatrix<BaseFloat> linear_params_;
-  CuVector<BaseFloat> bias_params_;
-  int32 num_blocks_;
- private:
-  const BlockAffineComponent &operator = (const BlockAffineComponent &other); // Disallow.
-};
-
 class SoftmaxComponent: public NonlinearComponent {
  public:
   explicit SoftmaxComponent(int32 dim): NonlinearComponent(dim) { }
@@ -1643,7 +1645,7 @@ class MaxpoolingComponent: public Component {
 };
 
 /**
-   CompositeComponent is components representing a sequence of
+   CompositeComponent is a component representing a sequence of
    [simple] components.  The config line would be something like the following
    (imagine this is all on one line):
 
@@ -1659,6 +1661,10 @@ class MaxpoolingComponent: public Component {
    much memory for very long (and you can make the memory usage very small by
    making max-rows-process small).  We inherit from UpdatableComponent just in
    case one or more of the components in the sequence are updatable.
+
+   It is an error to nest a CompositeComponent inside a CompositeComponent.
+   The same effect can be accomplished by specifying a smaller max-rows-process
+   in a single CompositeComponent.
  */
 class CompositeComponent: public UpdatableComponent {
  public:
@@ -1724,6 +1730,18 @@ class CompositeComponent: public UpdatableComponent {
   // want to store stats, as part of the backprop pass.  This is not 100% ideal
   // but it will usually do what you want.  We can revisit this later if needed.
 
+  // Functions to iterate over the internal components
+
+  int32 NumComponents() const { return components_.size();}
+  /// Gets the ith component in this component.
+  /// The ordering is the same as in the config line. The caller
+  /// does not own the received component.
+  const Component* GetComponent(int32 i) const;
+  /// Sets the ith component. After this call, CompositeComponent owns
+  /// the reference to the argument component. Frees the previous
+  /// ith component.
+  void SetComponent(int32 i, Component *component);
+
   virtual ~CompositeComponent() { DeletePointers(&components_); }
  protected:
   // returns true if at least one of 'components_' returns the kUpdatable flag
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index 97720a0f2c0..8286b7d8782 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -2,6 +2,7 @@
 
 // Copyright      2015  Johns Hopkins University (author: Daniel Povey)
 // Copyright      2015  Vijayaditya Peddinti
+// Copyright      2016  Daniel Galvez
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -712,7 +713,47 @@ void GenerateConfigSequenceDistribute(
   configs->push_back(os.str());
 }
 
-
+/// Generate a config string with a composite component composed only
+/// of block affine, repeated affine, and natural gradient repeated affine
+/// components.
+void GenerateConfigSequenceCompositeBlock(const NnetGenerationOptions &opts,
+                                          std::vector<std::string> *configs) {
+  int32 num_components = RandInt(1,5);
+  int32 input_dim = 10 * RandInt(1,10);
+  if (opts.output_dim > 0) {
+    KALDI_WARN  << "This function doesn't take a requested output_dim due to "
+      "implementation complications.";
+  }
+  int32 max_rows_process = 512 + 512 * RandInt(1,3);
+  std::ostringstream os;
+  os << "component name=composite1 type=CompositeComponent max-rows-process=" 
+     << max_rows_process << " num-components=" << num_components;
+
+  int32 types_length = 3;
+  std::string types[] = {"BlockAffineComponent",
+                         "RepeatedAffineComponent",
+                         "NaturalGradientRepeatedAffineComponent"};
+  int32 last_output_dim = input_dim;
+  // components within a composite component are indexed from 1.
+  for(int32 i = 1; i <= num_components; i++) {
+    os << " component" << i << "=";
+    int32 rand_index = RandInt(0, types_length - 1);
+    std::string rand_type = types[rand_index];
+    os << "'type=" << rand_type << " input-dim=" << last_output_dim;
+    int32 current_output_dim = 10 * RandInt(1,10);
+    // must be a divisor of current_output_dim and last_output_dim
+    int32 num_repeats = 10;
+    os << " output-dim=" << current_output_dim;
+    std::string repeats_string = (rand_type == "BlockAffineComponent") ? "num-blocks": "num-repeats";
+    os << " " << repeats_string << "=" << num_repeats << "'";
+    last_output_dim = current_output_dim;
+  }
+  os << std::endl << std::endl;
+  os << "input-node name=input dim=" << input_dim << std::endl;
+  os << "component-node name=composite1 component=composite1 input=input\n";
+  os << "output-node name=output input=composite1\n";
+  configs->push_back(os.str());
+}
 
 void GenerateConfigSequence(
     const NnetGenerationOptions &opts,
diff --git a/src/nnet3/nnet-test-utils.h b/src/nnet3/nnet-test-utils.h
index 25d4c21de41..564de7cfcd1 100644
--- a/src/nnet3/nnet-test-utils.h
+++ b/src/nnet3/nnet-test-utils.h
@@ -1,7 +1,7 @@
 // nnet3/nnet-test-utils.h
 
 // Copyright   2015  Johns Hopkins University (author: Daniel Povey)
-
+// Copyright   2016  Daniel Galvez
 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -59,6 +59,11 @@ struct NnetGenerationOptions {
 void GenerateConfigSequence(const NnetGenerationOptions &opts,
                             std::vector<std::string> *configs);
 
+/// Generate a config string with a composite component composed only
+/// of block affine, repeated affine, and natural gradient repeated affine
+/// components.
+void GenerateConfigSequenceCompositeBlock(const NnetGenerationOptions &opts,
+                                          std::vector<std::string> *configs);
 
 /**  This function computes an example computation request, for testing purposes.
      The "Simple" in the name means that it currently only supports neural nets
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index 3efc82b6cf2..359254f4794 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -124,7 +124,8 @@ void ObjectiveFunctionInfo::UpdateStats(
     int32 minibatches_per_phase,
     int32 minibatch_counter,
     BaseFloat this_minibatch_weight,
-    BaseFloat this_minibatch_tot_objf) {
+    BaseFloat this_minibatch_tot_objf,
+    BaseFloat this_minibatch_tot_aux_objf) {
   int32 phase = minibatch_counter / minibatches_per_phase;
   if (phase != current_phase) {
     KALDI_ASSERT(phase == current_phase + 1); // or doesn't really make sense.
@@ -132,11 +133,14 @@ void ObjectiveFunctionInfo::UpdateStats(
     current_phase = phase;
     tot_weight_this_phase = 0.0;
     tot_objf_this_phase = 0.0;
+    tot_aux_objf_this_phase = 0.0;
   }
   tot_weight_this_phase += this_minibatch_weight;
   tot_objf_this_phase += this_minibatch_tot_objf;
+  tot_aux_objf_this_phase += this_minibatch_tot_aux_objf;
   tot_weight += this_minibatch_weight;
   tot_objf += this_minibatch_tot_objf;
+  tot_aux_objf += this_minibatch_tot_aux_objf;
 }
 
 void ObjectiveFunctionInfo::PrintStatsForThisPhase(
@@ -144,19 +148,40 @@ void ObjectiveFunctionInfo::PrintStatsForThisPhase(
     int32 minibatches_per_phase) const {
   int32 start_minibatch = current_phase * minibatches_per_phase,
       end_minibatch = start_minibatch + minibatches_per_phase - 1;
-  KALDI_LOG << "Average objective function for '" << output_name
-            << "' for minibatches " << start_minibatch
-            << '-' << end_minibatch << " is "
-            << (tot_objf_this_phase / tot_weight_this_phase) << " over "
-            << tot_weight_this_phase << " frames.";
+
+  if (tot_aux_objf_this_phase == 0.0) {
+    KALDI_LOG << "Average objective function for '" << output_name
+              << "' for minibatches " << start_minibatch
+              << '-' << end_minibatch << " is "
+              << (tot_objf_this_phase / tot_weight_this_phase) << " over "
+              << tot_weight_this_phase << " frames.";
+  } else {
+    BaseFloat objf = (tot_objf_this_phase / tot_weight_this_phase),
+        aux_objf = (tot_aux_objf_this_phase / tot_weight_this_phase),
+        sum_objf = objf + aux_objf;
+    KALDI_LOG << "Average objective function for '" << output_name
+              << "' for minibatches " << start_minibatch
+              << '-' << end_minibatch << " is "
+              << objf << " + " << aux_objf << " = " << sum_objf
+              << " over " << tot_weight_this_phase << " frames.";
+  }
 }
 
 bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const {
-  KALDI_LOG << "Overall average objective function for '" << name << "' is "
-            << (tot_objf / tot_weight) << " over " << tot_weight << " frames.";
+  BaseFloat objf = (tot_objf / tot_weight),
+        aux_objf = (tot_aux_objf / tot_weight),
+        sum_objf = objf + aux_objf;
+  if (tot_aux_objf == 0.0) {
+    KALDI_LOG << "Overall average objective function for '" << name << "' is "
+              << (tot_objf / tot_weight) << " over " << tot_weight << " frames.";
+  } else {
+    KALDI_LOG << "Overall average objective function for '" << name << "' is "
+              << objf << " + " << aux_objf << " = " << sum_objf        
+              << " over " << tot_weight << " frames.";
+  }
   KALDI_LOG << "[this line is to be parsed by a script:] "
             << "log-prob-per-frame="
-            << (tot_objf / tot_weight);
+            << objf;
   return (tot_weight != 0.0);
 }
 
diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h
index b5ab61bb47c..7ad964084a7 100644
--- a/src/nnet3/nnet-training.h
+++ b/src/nnet3/nnet-training.h
@@ -84,14 +84,19 @@ struct ObjectiveFunctionInfo {
 
   double tot_weight;
   double tot_objf;
+  double tot_aux_objf;  // An 'auxiliary' objective function that is optional-
+                        // may be used when things like regularization are being
+                        // used.
 
   double tot_weight_this_phase;
   double tot_objf_this_phase;
+  double tot_aux_objf_this_phase;
 
   ObjectiveFunctionInfo():
       current_phase(0),
-      tot_weight(0.0), tot_objf(0.0),
-      tot_weight_this_phase(0.0), tot_objf_this_phase(0.0) { }
+      tot_weight(0.0), tot_objf(0.0), tot_aux_objf(0.0),
+      tot_weight_this_phase(0.0), tot_objf_this_phase(0.0),
+      tot_aux_objf_this_phase(0.0) { }
 
   // This function updates the stats and, if the phase has just changed,
   // prints a message indicating progress.  The phase equals
@@ -101,7 +106,8 @@ struct ObjectiveFunctionInfo {
                    int32 minibatches_per_phase,
                    int32 minibatch_counter,
                    BaseFloat this_minibatch_weight,
-                   BaseFloat this_minibatch_tot_objf);
+                   BaseFloat this_minibatch_tot_objf,
+                   BaseFloat this_minibatch_tot_aux_objf = 0.0);
 
   // Prints stats for the current phase.
   void PrintStatsForThisPhase(const std::string &output_name,
diff --git a/src/nnet3/nnet-utils-test.cc b/src/nnet3/nnet-utils-test.cc
index 62cb240a3c0..ef1588044b2 100644
--- a/src/nnet3/nnet-utils-test.cc
+++ b/src/nnet3/nnet-utils-test.cc
@@ -1,6 +1,7 @@
 // nnet3/nnet-utils-test.cc
 
 // Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+//           2016  Daniel Galvez
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -18,6 +19,7 @@
 // limitations under the License.
 
 #include "nnet3/nnet-nnet.h"
+#include "nnet3/nnet-simple-component.h"
 #include "nnet3/nnet-test-utils.h"
 
 namespace kaldi {
@@ -45,6 +47,59 @@ void UnitTestNnetContext() {
   }
 }
 
+void UnitTestConvertRepeatedToBlockAffine() {
+  // a test without a composite component: no repeated-affine type may survive.
+  std::string config =
+    "component name=repeated-affine1 type=RepeatedAffineComponent "
+    "input-dim=100 output-dim=200 num-repeats=20\n"
+    "component name=relu1 type=RectifiedLinearComponent dim=200\n"
+    "component name=block-affine1 type=BlockAffineComponent "
+    "input-dim=200 output-dim=100 num-blocks=10\n"
+    "component name=relu2 type=RectifiedLinearComponent dim=100\n"
+    "component name=repeated-affine2 type=NaturalGradientRepeatedAffineComponent "
+    "input-dim=100 output-dim=200 num-repeats=10\n"
+    "\n"
+    "input-node name=input dim=100\n"
+    "component-node name=repeated-affine1 component=repeated-affine1 input=input\n"
+    "component-node name=relu1 component=relu1 input=repeated-affine1\n"
+    "component-node name=block-affine1 component=block-affine1 input=relu1\n"
+    "component-node name=relu2 component=relu2 input=block-affine1\n"
+    "component-node name=repeated-affine2 component=repeated-affine2 input=relu2\n"
+    "output-node name=output input=repeated-affine2\n";
+
+  Nnet nnet;
+  std::istringstream is(config);
+  nnet.ReadConfig(is);
+  ConvertRepeatedToBlockAffine(&nnet);
+
+  for (int32 i = 0; i < nnet.NumComponents(); i++) {
+    Component *c = nnet.GetComponent(i);
+    KALDI_ASSERT(c->Type() != "RepeatedAffineComponent"
+                 && c->Type() != "NaturalGradientRepeatedAffineComponent");
+  }
+}
+
+void UnitTestConvertRepeatedToBlockAffineComposite() {
+  // test that repeated affine components nested within a CompositeComponent
+  // are converted.
+  struct NnetGenerationOptions gen_config;
+  gen_config.output_dim = 0;
+  std::vector<std::string> configs;
+  // this generates a neural net with one component: a composite component.
+  GenerateConfigSequenceCompositeBlock(gen_config, &configs);
+  Nnet nnet;
+  std::istringstream is(configs[0]);
+  nnet.ReadConfig(is);
+  KALDI_ASSERT(nnet.NumComponents() == 1);
+  ConvertRepeatedToBlockAffine(&nnet);
+  CompositeComponent *cc = dynamic_cast<CompositeComponent*>(nnet.GetComponent(0));
+  KALDI_ASSERT(cc != NULL);  // fail cleanly rather than dereference NULL below
+  for(int i = 0; i < cc->NumComponents(); i++) {
+    const Component *c = cc->GetComponent(i);
+    KALDI_ASSERT(c->Type() == "BlockAffineComponent");
+  }
+}
+
 } // namespace nnet3
 } // namespace kaldi
 
@@ -54,6 +109,8 @@ int main() {
   SetVerboseLevel(2);
 
   UnitTestNnetContext();
+  UnitTestConvertRepeatedToBlockAffine();
+  UnitTestConvertRepeatedToBlockAffineComposite();
 
   KALDI_LOG << "Nnet tests succeeded.";
 
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 8b2eb787f84..89557950146 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -1,7 +1,8 @@
 // nnet3/nnet-utils.cc
 
 // Copyright      2015  Johns Hopkins University (author: Daniel Povey)
-
+//                2016  Daniel Galvez
+//
 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,6 +19,7 @@
 // limitations under the License.
 
 #include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-simple-component.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -40,10 +42,8 @@ int32 NumInputNodes(const Nnet &nnet) {
 
 
 bool IsSimpleNnet(const Nnet &nnet) {
-  // check that we have just one output node and it is
-  // called "output".
-  if (NumOutputNodes(nnet) != 1 ||
-      nnet.GetNodeIndex("output") == -1 ||
+  // check that we have an output node called "output".
+  if (nnet.GetNodeIndex("output") == -1 ||
       !nnet.IsOutputNode(nnet.GetNodeIndex("output")))
     return false;
   // check that there is an input node named "input".
@@ -101,9 +101,18 @@ static void ComputeSimpleNnetContextForShift(
     input.indexes.push_back(Index(n, t));
     output.indexes.push_back(Index(n, t));
   }
-  // the assumption here is that the network just requires the ivector at time
-  // t=0.
-  ivector.indexes.push_back(Index(n, 0));
+
+  {
+    // the assumption here is that the network just requires the ivector at time
+    // t=0.
+    if (input_start > 0)
+      ivector.indexes.push_back(Index(n, 0));
+
+    // but it's okay to add other indices
+    for (int32 t = input_start; t < input_end; t++) {
+      ivector.indexes.push_back(Index(n, t));
+    }
+  } 
 
   ComputationRequest request;
   request.inputs.push_back(input);
@@ -248,6 +257,22 @@ void ZeroComponentStats(Nnet *nnet) {
   }
 }
 
+void ScaleLearningRate(BaseFloat learning_rate_scale,
+                       Nnet *nnet) {
+  for (int32 c = 0; c < nnet->NumComponents(); c++) {
+    Component *component = nnet->GetComponent(c);
+    if (!(component->Properties() & kUpdatableComponent))
+      continue;  // nothing to rescale on a non-updatable component.
+    // Updatable components are currently expected to derive from class
+    // UpdatableComponent; this cast must be revisited if that ever changes.
+    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(component);
+    if (uc == NULL)
+      KALDI_ERR << "Updatable component does not inherit from class "
+          "UpdatableComponent; change this code.";
+    uc->SetLearningRate(uc->LearningRate() * learning_rate_scale);
+  }
+}
+
 void SetLearningRate(BaseFloat learning_rate,
                      Nnet *nnet) {
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
@@ -264,6 +289,57 @@ void SetLearningRate(BaseFloat learning_rate,
   }
 }
 
+void SetLearningRates(const Vector<BaseFloat> &learning_rates,
+                      Nnet *nnet) {
+  int32 i = 0;  // position in learning_rates of the next updatable component.
+  for (int32 c = 0; c < nnet->NumComponents(); c++) {
+    Component *component = nnet->GetComponent(c);
+    if (!(component->Properties() & kUpdatableComponent))
+      continue;
+    // Updatable components are expected to derive from UpdatableComponent.
+    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(component);
+    if (uc == NULL)
+      KALDI_ERR << "Updatable component does not inherit from class "
+          "UpdatableComponent; change this code.";
+    KALDI_ASSERT(i < learning_rates.Dim());
+    uc->SetLearningRate(learning_rates(i++));
+  }
+}
+
+void GetLearningRates(const Nnet &nnet,
+                      Vector<BaseFloat> *learning_rates) {
+  // One output entry per updatable component, in component-index order.
+  learning_rates->Resize(NumUpdatableComponents(nnet));
+  for (int32 c = 0, i = 0; c < nnet.NumComponents(); c++) {
+    const Component *component = nnet.GetComponent(c);
+    if (!(component->Properties() & kUpdatableComponent))
+      continue;
+    const UpdatableComponent *uc =
+        dynamic_cast<const UpdatableComponent*>(component);
+    if (uc == NULL)
+      KALDI_ERR << "Updatable component does not inherit from class "
+          "UpdatableComponent; change this code.";
+    (*learning_rates)(i++) = uc->LearningRate();
+  }
+}
+
+void ScaleNnetComponents(const Vector<BaseFloat> &scale_factors,
+                         Nnet *nnet) {
+  int32 i = 0;  // position in scale_factors of the next updatable component.
+  for (int32 c = 0; c < nnet->NumComponents(); c++) {
+    Component *component = nnet->GetComponent(c);
+    if (!(component->Properties() & kUpdatableComponent))
+      continue;
+    // Updatable components are expected to derive from UpdatableComponent.
+    UpdatableComponent *uc = dynamic_cast<UpdatableComponent*>(component);
+    if (uc == NULL)
+      KALDI_ERR << "Updatable component does not inherit from class "
+          "UpdatableComponent; change this code.";
+    KALDI_ASSERT(i < scale_factors.Dim());
+    uc->Scale(scale_factors(i++));
+  }
+}
+
 void ScaleNnet(BaseFloat scale, Nnet *nnet) {
   if (scale == 1.0) return;
   else if (scale == 0.0) {
@@ -359,6 +435,51 @@ int32 NumUpdatableComponents(const Nnet &dest) {
   return ans;
 }
 
+void ConvertRepeatedToBlockAffine(CompositeComponent *c_component) {
+  for (int32 i = 0; i < c_component->NumComponents(); i++) {
+    const Component *c = c_component->GetComponent(i);
+    KALDI_ASSERT(c->Type() != "CompositeComponent" &&
+                 "Nesting CompositeComponent within CompositeComponent is not allowed.\n"
+                 "(We may change this as more complicated components are introduced.)");
+
+    const std::string type = c->Type();
+    if (type != "RepeatedAffineComponent" &&
+        type != "NaturalGradientRepeatedAffineComponent")
+      continue;  // every other component type is left untouched.
+    // NaturalGradientRepeatedAffineComponent derives from
+    // RepeatedAffineComponent, so one cast serves both types.
+    const RepeatedAffineComponent *rac =
+        dynamic_cast<const RepeatedAffineComponent*>(c);
+    KALDI_ASSERT(rac != NULL);
+    // SetComponent() deletes the component rac points to.
+    c_component->SetComponent(i, new BlockAffineComponent(*rac));
+  }
+}
+
+void ConvertRepeatedToBlockAffine(Nnet *nnet) {
+  for (int32 i = 0; i < nnet->NumComponents(); i++) {
+    const Component *const_c = nnet->GetComponent(i);
+    const std::string type = const_c->Type();
+    if (type == "CompositeComponent") {
+      // The composite has to be modified in place, hence the non-const
+      // GetComponent() call here.
+      CompositeComponent *cc =
+          dynamic_cast<CompositeComponent*>(nnet->GetComponent(i));
+      KALDI_ASSERT(cc != NULL);
+      ConvertRepeatedToBlockAffine(cc);
+    } else if (type == "RepeatedAffineComponent" ||
+               type == "NaturalGradientRepeatedAffineComponent") {
+      // NaturalGradientRepeatedAffineComponent derives from
+      // RepeatedAffineComponent, so one cast serves both types.
+      const RepeatedAffineComponent *rac =
+          dynamic_cast<const RepeatedAffineComponent*>(const_c);
+      KALDI_ASSERT(rac != NULL);
+      // SetComponent() deletes the component rac points to.
+      nnet->SetComponent(i, new BlockAffineComponent(*rac));
+    }
+  }
+}
+
 std::string NnetInfo(const Nnet &nnet) {
   std::ostringstream ostr;
   if (IsSimpleNnet(nnet)) {
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 9241d43f54d..f519b774382 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -1,7 +1,7 @@
 // nnet3/nnet-utils.h
 
 // Copyright   2015  Johns Hopkins University (author: Daniel Povey)
-
+//             2016  Daniel Galvez
 // See ../../COPYING for clarification regarding multiple authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -85,7 +85,8 @@ std::string PrintVectorPerUpdatableComponent(const Nnet &nnet,
                                              const VectorBase<BaseFloat> &vec);
 
 /// This function returns true if the nnet has the following properties:
-///  It has one output, called "output".
+///  It has an output called "output" (other outputs are allowed but may be
+///          ignored).
 ///  It has an input called "input", and possibly an extra input called
 ///    "ivector", but no other inputs.
 ///  There are probably some other properties that we really ought to
@@ -113,9 +114,22 @@ void ComputeSimpleNnetContext(const Nnet &nnet,
 void SetLearningRate(BaseFloat learning_rate,
                      Nnet *nnet);
 
+/// Scales the learning rate for all the components in the nnet by this factor
+void ScaleLearningRate(BaseFloat learning_rate_scale,
+                       Nnet *nnet);
+
+void SetLearningRates(const Vector<BaseFloat> &learning_rates,
+                      Nnet *nnet);
+
+void GetLearningRates(const Nnet &nnet,
+                      Vector<BaseFloat> *learning_rates);
+
 /// Scales the nnet parameters and stats by this scale.
 void ScaleNnet(BaseFloat scale, Nnet *nnet);
 
+void ScaleNnetComponents(const Vector<BaseFloat> &learning_rates,
+                         Nnet *nnet);
+
 /// Does *dest += alpha * src (affects nnet parameters and
 ///  stored stats).
 void AddNnet(const Nnet &src, BaseFloat alpha, Nnet *dest);
@@ -138,6 +152,9 @@ void UnVectorizeNnet(const VectorBase<BaseFloat> &params,
 /// Returns the number of updatable components in the nnet.
 int32 NumUpdatableComponents(const Nnet &dest);
 
+/// Convert all components of type RepeatedAffineComponent or
+/// NaturalGradientRepeatedAffineComponent to BlockAffineComponent in nnet.
+void ConvertRepeatedToBlockAffine(Nnet *nnet);
 
 /// This function returns various info about the neural net.
 /// If the nnet satisfied IsSimpleNnet(nnet), the info includes "left-context=5\nright-context=3\n...".  The info includes
diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile
index 0a57c17fad0..e2c7d4a3b1c 100644
--- a/src/nnet3bin/Makefile
+++ b/src/nnet3bin/Makefile
@@ -12,7 +12,14 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \
    nnet3-am-adjust-priors nnet3-am-copy nnet3-compute-prob \
    nnet3-average nnet3-am-info nnet3-combine nnet3-latgen-faster \
    nnet3-copy nnet3-show-progress nnet3-align-compiled \
-   nnet3-get-egs-dense-targets nnet3-compute
+   nnet3-get-egs-dense-targets nnet3-compute \
+	 nnet3-discriminative-get-egs nnet3-discriminative-copy-egs \
+	 nnet3-discriminative-merge-egs nnet3-discriminative-shuffle-egs \
+	 nnet3-discriminative-compute-objf nnet3-discriminative-train \
+	 discriminative-get-supervision nnet3-modify-learning-rates \
+	 nnet3-discriminative-copy-egs-to-nnet2 \
+	 nnet3-discriminative-copy-egs-from-nnet2 \
+	 nnet3-compute-from-degs nnet3-discriminative-subset-egs
 
 OBJFILES =
 
@@ -21,11 +28,11 @@ cuda-compiled.o: ../kaldi.mk
 
 TESTFILES =
 
-ADDLIBS = ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \
+ADDLIBS = ../chain/kaldi-chain.a ../nnet3/kaldi-nnet3.a ../nnet2/kaldi-nnet2.a ../gmm/kaldi-gmm.a \
          ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a  \
          ../transform/kaldi-transform.a ../tree/kaldi-tree.a \
          ../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \
          ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \
          ../util/kaldi-util.a ../base/kaldi-base.a
 
 include ../makefiles/default_rules.mk
diff --git a/src/nnet3bin/discriminative-get-supervision.cc b/src/nnet3bin/discriminative-get-supervision.cc
new file mode 100644
index 00000000000..3fc03bdc24f
--- /dev/null
+++ b/src/nnet3bin/discriminative-get-supervision.cc
@@ -0,0 +1,165 @@
+// nnet3bin/discriminative-get-supervision.cc
+
+// Copyright      2015  Johns Hopkins University (author:  Daniel Povey)
+// Copyright 2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/discriminative-supervision.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::discriminative;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Get a discriminative training supervision object for each file of training data.\n"
+        "This will normally be piped into nnet3-discriminative-get-egs, where it\n"
+        "will be split up into pieces and combined with the features.\n"
+        "Input can come in two formats: \n"
+        "numerator alignments / denominator lattice pair \n"
+        ", or numerator and denominator lattice pair\n"
+        "Usage: discriminative-get-supervision [options] <ali-rspecifier> \\\n"
+        "<den-lattice-rspecifier> <supervision-wspecifier>\n";
+
+    // Optional extra inputs; each one is used only if its rspecifier is given.
+    std::string num_lat_rspecifier, oracle_rspecifier,
+                frame_weights_rspecifier;
+    DiscriminativeSupervisionOptions sup_opts;
+
+    ParseOptions po(usage);
+    po.Register("num-lat-rspecifier", &num_lat_rspecifier, "Get supervision "
+                "with numerator lattice");
+    po.Register("oracle-rspecifier", &oracle_rspecifier, "Add oracle "
+                "alignment to supervision");
+    po.Register("frame-weights-rspecifier", &frame_weights_rspecifier,
+                "Add frame weights to supervision");
+    sup_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    // Record once which of the optional inputs were requested.
+    const bool have_num_lat = !num_lat_rspecifier.empty(),
+               have_oracle = !oracle_rspecifier.empty(),
+               have_frame_weights = !frame_weights_rspecifier.empty();
+
+    const std::string num_ali_rspecifier = po.GetArg(1),
+                      den_lat_rspecifier = po.GetArg(2),
+                      supervision_wspecifier = po.GetArg(3);
+
+    DiscriminativeSupervisionWriter supervision_writer(supervision_wspecifier);
+    RandomAccessLatticeReader den_lat_reader(den_lat_rspecifier);
+    SequentialInt32VectorReader ali_reader(num_ali_rspecifier);
+
+    // These readers are always constructed, but are only queried when the
+    // corresponding rspecifier is non-empty.
+    RandomAccessLatticeReader num_lat_reader(num_lat_rspecifier);
+    RandomAccessInt32VectorReader oracle_reader(oracle_rspecifier);
+    RandomAccessBaseFloatVectorReader frame_weights_reader(frame_weights_rspecifier);
+
+    int32 num_utts_done = 0, num_utts_error = 0;
+
+    // Iterate over the numerator alignments; every other input is looked up
+    // by utterance key.
+    for (; !ali_reader.Done(); ali_reader.Next())  {
+      const std::string &key = ali_reader.Key();
+      const std::vector<int32> &num_ali = ali_reader.Value();
+
+      // Skip the utterance if any input that is needed lacks this key.
+      if (!den_lat_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find denominator lattice for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+      if (have_num_lat && !num_lat_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find numerator lattice for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+      if (have_oracle && !oracle_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find oracle alignment for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+      if (have_frame_weights && !frame_weights_reader.HasKey(key)) {
+        KALDI_WARN << "Could not find frame weights for utterance "
+                   << key;
+        num_utts_error++;
+        continue;
+      }
+
+      // Copy the optional per-utterance inputs; they stay empty when unused.
+      std::vector<int32> oracle_ali;
+      if (have_oracle)
+        oracle_ali = oracle_reader.Value(key);
+      Vector<BaseFloat> frame_weights;
+      if (have_frame_weights)
+        frame_weights = frame_weights_reader.Value(key);
+      Vector<BaseFloat> *weights_ptr = (have_frame_weights ? &frame_weights : NULL);
+      std::vector<int32> *oracle_ptr = (have_oracle ? &oracle_ali : NULL);
+
+      const Lattice &den_lat = den_lat_reader.Value(key);
+      DiscriminativeSupervision supervision;
+
+      // With a numerator lattice the seven-argument overload is called,
+      // otherwise the one taking only the alignment and denominator lattice.
+      bool converted;
+      if (have_num_lat) {
+        const Lattice &num_lat = num_lat_reader.Value(key);
+        converted = LatticeToDiscriminativeSupervision(
+            num_ali, num_lat, den_lat, 1.0, &supervision,
+            weights_ptr, oracle_ptr);
+      } else {
+        converted = LatticeToDiscriminativeSupervision(
+            num_ali, den_lat, 1.0, &supervision,
+            weights_ptr, oracle_ptr);
+      }
+      if (!converted) {
+        KALDI_WARN << "Failed to convert lattice to supervision "
+                   << "for utterance " << key;
+        num_utts_error++;
+        continue;
+      }
+
+      supervision_writer.Write(key, supervision);
+      num_utts_done++;
+    }
+
+    KALDI_LOG << "Generated discriminative supervision information for "
+              << num_utts_done << " utterances, errors on "
+              << num_utts_error;
+    // Succeed only if more utterances were converted than failed.
+    return (num_utts_done > num_utts_error ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/nnet3bin/nnet3-align-compiled.cc b/src/nnet3bin/nnet3-align-compiled.cc
index f04f5f61215..b58ae10e7d6 100644
--- a/src/nnet3bin/nnet3-align-compiled.cc
+++ b/src/nnet3bin/nnet3-align-compiled.cc
@@ -81,6 +81,8 @@ int main(int argc, char *argv[]) {
     po.Register("online-ivector-period", &online_ivector_period, "Number of frames "
                 "between iVectors in matrices supplied to the --online-ivectors "
                 "option");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
     po.Read(argc, argv);
 
     if (po.NumArgs() < 4 || po.NumArgs() > 5) {
diff --git a/src/nnet3bin/nnet3-am-copy.cc b/src/nnet3bin/nnet3-am-copy.cc
index 1a66615b430..dd38288418e 100644
--- a/src/nnet3bin/nnet3-am-copy.cc
+++ b/src/nnet3bin/nnet3-am-copy.cc
@@ -1,6 +1,7 @@
 // nnet3bin/nnet3-am-copy.cc
 
 // Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//           2016 Daniel Galvez
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -46,7 +47,9 @@ int main(int argc, char *argv[]) {
     bool binary_write = true,
         raw = false;
     BaseFloat learning_rate = -1;
+    BaseFloat learning_rate_scale = 1;
     std::string set_raw_nnet = "";
+    bool convert_repeated_to_block = false;
     BaseFloat scale = 1.0;
 
     ParseOptions po(usage);
@@ -57,9 +60,16 @@ int main(int argc, char *argv[]) {
                 "Set the raw nnet inside the model to the one provided in "
                 "the option string (interpreted as an rxfilename).  Done "
                 "before the learning-rate is changed.");
+    po.Register("convert-repeated-to-block", &convert_repeated_to_block,
+                "Convert all RepeatedAffineComponents and "
+                "NaturalGradientRepeatedAffineComponents to "
+                "BlockAffineComponents in the model. Done after set-raw-nnet.");
     po.Register("learning-rate", &learning_rate,
                 "If supplied, all the learning rates of updatable components"
                 " are set to this value.");
+    po.Register("learning-rate-scale", &learning_rate_scale,
+                "Scales the learning rate of updatable components by this "
+                "factor");
     po.Register("scale", &scale, "The parameter matrices are scaled"
                 " by the specified value.");
 
@@ -89,8 +99,16 @@ int main(int argc, char *argv[]) {
       am_nnet.SetNnet(nnet);
     }
 
+    if(convert_repeated_to_block)
+      ConvertRepeatedToBlockAffine(&(am_nnet.GetNnet()));
+
     if (learning_rate >= 0)
       SetLearningRate(learning_rate, &(am_nnet.GetNnet()));
+    
+    KALDI_ASSERT(learning_rate_scale >= 0.0);
+
+    if (learning_rate_scale != 1.0)
+      ScaleLearningRate(learning_rate_scale, &(am_nnet.GetNnet()));
 
     if (scale != 1.0)
       ScaleNnet(scale, &(am_nnet.GetNnet()));
diff --git a/src/nnet3bin/nnet3-compute-from-degs.cc b/src/nnet3bin/nnet3-compute-from-degs.cc
new file mode 100644
index 00000000000..bf463fb3690
--- /dev/null
+++ b/src/nnet3bin/nnet3-compute-from-degs.cc
@@ -0,0 +1,148 @@
+// nnet3bin/nnet3-compute-from-degs.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-nnet.h"
+#include "nnet3/nnet-example-utils.h"
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet3/nnet-optimize.h"
+#include "transform/lda-estimate.h"
+
+
+namespace kaldi {
+namespace nnet3 {
+
+class NnetComputerFromEg {
+ public:
+  NnetComputerFromEg(const Nnet &nnet): nnet_(nnet), compiler_(nnet) { }
+
+  // Runs the forward computation for "eg" and copies the network output named
+  // "output" into *output, which is resized to the dimensions of that output.
+  void Compute(const NnetExample &eg, Matrix<BaseFloat> *output) {
+    // Forward pass only: neither backprop nor stats storage is requested.
+    const bool need_backprop = false, store_stats = false;
+    ComputationRequest request;
+    GetComputationRequest(nnet_, eg, need_backprop, store_stats, &request);
+    const NnetComputation &computation = *(compiler_.Compile(request));
+
+    NnetComputeOptions compute_opts;
+    if (GetVerboseLevel() >= 3) compute_opts.debug = true;
+    NnetComputer computer(compute_opts, computation, nnet_, NULL);
+    computer.AcceptInputs(nnet_, eg.io);
+    computer.Forward();
+    const CuMatrixBase<BaseFloat> &result = computer.GetOutput("output");
+    output->Resize(result.NumRows(), result.NumCols());
+    result.CopyToMat(output);
+  }
+
+ private:
+  const Nnet &nnet_;
+  CachingOptimizingCompiler compiler_;
+};
+
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Read input nnet3 discriminative training examples, and compute the output\n"
+        "for each one.  If --apply-exp=true, apply the Exp() function to the output\n"
+        "before writing it out.\n"
+        "\n"
+        "Usage:  nnet3-compute-from-degs [options] <raw-nnet-in> <training-examples-in> <matrices-out>\n"
+        "e.g.:\n"
+        "nnet3-compute-from-degs --apply-exp=true 0.raw ark:1.degs ark:- | matrix-sum-rows ark:- ... \n"
+        "See also: nnet3-compute\n";
+    // NOTE(review): binary_write is registered below but not read elsewhere here.
+    bool binary_write = true,
+        apply_exp = false;
+    std::string use_gpu = "yes";
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("apply-exp", &apply_exp, "If true, apply exp function to "
+                "output");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string nnet_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2),
+        matrix_wspecifier = po.GetArg(3);
+
+    Nnet nnet;
+    ReadKaldiObject(nnet_rxfilename, &nnet);
+
+    NnetComputerFromEg computer(nnet);
+
+    int64 num_egs = 0;
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+    BaseFloatMatrixWriter matrix_writer(matrix_wspecifier);
+    // Each output matrix is written under the key of the example it came from.
+    for (; !example_reader.Done(); example_reader.Next(), num_egs++) {
+      Matrix<BaseFloat> output;
+      NnetExample eg;
+      NnetDiscriminativeExample disc_eg = example_reader.Value();
+      eg.io.swap(disc_eg.inputs);
+      // Only the name and indexes of each output are carried over to the eg.
+      for (int32 i = 0; i < disc_eg.outputs.size(); i++) {
+        NnetIo io;
+        io.name = disc_eg.outputs[i].name;
+        io.indexes = disc_eg.outputs[i].indexes;
+        eg.io.push_back(io);
+      }
+
+      computer.Compute(eg, &output);
+      KALDI_ASSERT(output.NumRows() != 0);
+      if (apply_exp)
+        output.ApplyExp();
+      matrix_writer.Write(example_reader.Key(), output);
+    }
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+    KALDI_LOG << "Processed " << num_egs << " examples.";
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-compute-objf.cc b/src/nnet3bin/nnet3-discriminative-compute-objf.cc
new file mode 100644
index 00000000000..ea358136fb7
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-compute-objf.cc
@@ -0,0 +1,93 @@
+// nnet3bin/nnet3-discriminative-compute-objf.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author: Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-discriminative-diagnostics.h"
+#include "nnet3/am-nnet-simple.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Computes and prints in logging messages the objective function per frame of\n"
+        "the given data with an nnet3 neural net.  The input of this is the output of\n"
+        "e.g. nnet3-discriminative-get-egs | nnet3-discriminative-merge-egs.\n"
+        "\n"
+        "Usage:  nnet3-discriminative-compute-objf [options] <nnet3-model-in> <training-examples-in>\n"
+        "e.g.: nnet3-discriminative-compute-objf 0.mdl ark:valid.degs\n";
+
+
+    // This program doesn't support using a GPU, because these probabilities are
+    // used for diagnostics, and you can just compute them with a small enough
+    // amount of data that a CPU can do it within reasonable time.
+    // It wouldn't be hard to make it support GPU, though.
+
+    NnetComputeProbOptions nnet_opts;
+    discriminative::DiscriminativeTrainingOptions discriminative_training_opts;
+
+    ParseOptions po(usage);
+
+    nnet_opts.Register(&po);
+    discriminative_training_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2);
+
+    TransitionModel tmodel;
+    AmNnetSimple am_nnet;
+    // The model file holds the transition model followed by the acoustic model.
+    {
+      bool binary;
+      Input ki(model_rxfilename, &binary);
+      tmodel.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+    }
+
+    NnetDiscriminativeComputeObjf discriminative_objf_computer(
+        nnet_opts, discriminative_training_opts,
+        tmodel, am_nnet.Priors(), am_nnet.GetNnet());
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+    // Process every example, then print the accumulated totals.
+    for (; !example_reader.Done(); example_reader.Next())
+      discriminative_objf_computer.Compute(example_reader.Value());
+
+    bool ok = discriminative_objf_computer.PrintTotalStats();
+
+    return (ok ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-copy-egs-from-nnet2.cc b/src/nnet3bin/nnet3-discriminative-copy-egs-from-nnet2.cc
new file mode 100644
index 00000000000..f5f4581933f
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-copy-egs-from-nnet2.cc
@@ -0,0 +1,216 @@
+// nnet3bin/nnet3-discriminative-copy-egs-from-nnet2.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet2/nnet-example.h"
+#include "nnet2/nnet-example-functions.h"
+
+namespace kaldi {
+// Randomly rounds "expected_count" to floor(expected_count) or
+// ceil(expected_count), such that the expected result is "expected_count".
+int32 GetCount(double expected_count) {
+  KALDI_ASSERT(expected_count >= 0.0);
+  const int32 whole = floor(expected_count);
+  const double fraction = expected_count - whole;
+  // The fractional part is the probability handed to WithProb().
+  const bool round_up = WithProb(fraction);
+  return (round_up ? whole + 1 : whole);
+}
+
+void ConvertToNnet3(const nnet2::DiscriminativeNnetExample &eg,
+                    int32 fixed_vector_dim,
+                    nnet3::NnetDiscriminativeExample *nnet3_eg) {
+  // Fills *nnet3_eg from the nnet2 example: one "output", "input", maybe "ivector".
+  nnet3_eg->outputs.resize(1);
+  nnet3_eg->outputs[0].supervision.num_ali = eg.num_ali;
+  ConvertLattice(eg.den_lat, &(nnet3_eg->outputs[0].supervision.den_lat));
+  fst::TopSort(&(nnet3_eg->outputs[0].supervision.den_lat));
+  // Work out how the nnet2 input splits into features and fixed vector.  If
+  // eg.spk_info is empty, the fixed vector (if there is one) occupies the last
+  // fixed_vector_dim columns of eg.input_frames; fixed_vector_dim <= 0 then
+  // means there is none.  If eg.spk_info is non-empty, it is the fixed vector.
+  const bool in_frames = (eg.spk_info.Dim() == 0 && fixed_vector_dim > 0);
+  int32 ivector_dim = eg.spk_info.Dim();
+  int32 feat_dim = eg.input_frames.NumCols();
+  if (in_frames) {
+    ivector_dim = fixed_vector_dim;
+    feat_dim -= fixed_vector_dim;
+  }
+  KALDI_ASSERT(feat_dim > 0);
+  int32 nrows = eg.input_frames.NumRows();
+
+  nnet3_eg->inputs.resize(ivector_dim > 0 ? 2 : 1);
+
+  nnet3_eg->inputs[0].name = "input";
+  SubMatrix<BaseFloat> feats(eg.input_frames, 0, nrows, 0, feat_dim);
+  nnet3_eg->inputs[0].features = feats;
+
+  nnet3_eg->inputs[0].indexes.resize(nrows);
+  for (int32 i = 0; i < nrows; i++) {
+    nnet3_eg->inputs[0].indexes[i] = nnet3::Index(0, i - eg.left_context, 0);
+  }
+  // "ivector" input: one row per frame if taken from the frames, else one row.
+  if (ivector_dim > 0) {
+    nnet3_eg->inputs[1].name = "ivector";
+    Matrix<BaseFloat> ivector;
+
+    if (in_frames) {
+      ivector.Resize(nrows, ivector_dim);
+      ivector.CopyFromMat(eg.input_frames.Range(0, nrows, feat_dim, ivector_dim));
+      nnet3_eg->inputs[1].indexes.resize(nrows);
+      for (int32 i = 0; i < nrows; i++) {
+        nnet3_eg->inputs[1].indexes[i] = nnet3::Index(0, i - eg.left_context, 0);
+      }
+    } else {
+      ivector.Resize(1, ivector_dim);
+      ivector.Row(0).CopyFromVec(eg.spk_info);
+      nnet3_eg->inputs[1].indexes.resize(1);
+      nnet3_eg->inputs[1].indexes[0] = nnet3::Index(0, 0, 0);
+    }
+    nnet3_eg->inputs[1].features = ivector;
+  }
+  // One output index per frame of the numerator alignment, starting at t = 0.
+  nnet3_eg->outputs[0].name = "output";
+  nnet3_eg->outputs[0].indexes.resize(eg.num_ali.size());
+  for (int32 i = 0; i < eg.num_ali.size(); i++) {
+    nnet3_eg->outputs[0].indexes[i] = nnet3::Index(0, i, 0);
+  }
+
+  nnet3_eg->outputs[0].supervision.weight = eg.weight;
+  nnet3_eg->outputs[0].supervision.frames_per_sequence = eg.num_ali.size();
+  nnet3_eg->outputs[0].supervision.num_sequences = 1;
+  nnet3_eg->outputs[0].CheckDim();
+}
+
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Copy examples to nnet3 discriminative training, possibly changing the binary mode \n"
+        "from nnet2 examples\n"
+        "\n"
+        "Usage:  nnet3-discriminative-copy-egs-from-nnet2 [options] <egs-rspecifier> <egs-wspecifier1> [<egs-wspecifier2> ...]\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-discriminative-copy-egs-from-nnet2 ark:train.degs ark,t:text.degs\n"
+        "or:\n"
+        "nnet3-discriminative-copy-egs-from-nnet2 ark:train.degs ark:1.degs ark:2.degs\n";
+
+    // Option values with their defaults; the Register() calls describe them.
+    bool random = false;
+    BaseFloat keep_proportion = 1.0;
+    int32 srand_seed = 0;
+    int32 frame_shift = 0;
+    int32 truncate_deriv_weights = 0;
+    int32 max_length = -1;
+    int32 fixed_vector_dim = -1;
+
+    ParseOptions po(usage);
+    po.Register("random", &random, "If true, will write frames to output "
+                "archives randomly, not round-robin.");
+    po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will "
+                "randomly keep this proportion of the input samples.  If >1.0, it will "
+                "in expectation copy a sample this many times.  It will copy it a number "
+                "of times equal to floor(keep-proportion) or ceil(keep-proportion).");
+    po.Register("srand", &srand_seed, "Seed for random number generator "
+                "(only relevant if --random=true or --keep-proportion != 1.0)");
+    po.Register("frame-shift", &frame_shift, "Allows you to shift time values "
+                "in the supervision data (excluding iVector data) - useful in "
+                "augmenting data.  Note, the outputs will remain at the closest "
+                "exact multiples of the frame subsampling factor");
+    po.Register("truncate-deriv-weights", &truncate_deriv_weights,
+                "If nonzero, the number of initial/final subsample frames that "
+                "will have their derivatives' weights set to zero.");
+    po.Register("max-length", &max_length, "Pads small segments so that "
+                "they are all of the size max_length");
+    po.Register("fixed-vector-dim", &fixed_vector_dim, "i-vector dimension");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() < 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    nnet2::SequentialDiscriminativeNnetExampleReader example_reader(
+        po.GetArg(1));
+
+    // One writer per output wspecifier (arguments 2, 3, ...).
+    const int32 num_outputs = po.NumArgs() - 1;
+    std::vector<NnetDiscriminativeExampleWriter*> example_writers(num_outputs);
+    for (int32 n = 0; n < num_outputs; n++)
+      example_writers[n] = new NnetDiscriminativeExampleWriter(po.GetArg(n + 2));
+
+    // Names of inputs whose times are never shifted; not configurable for now.
+    std::vector<std::string> exclude_names(1, std::string("ivector"));
+
+    int64 num_read = 0, num_written = 0;
+    for (; !example_reader.Done(); example_reader.Next(), num_read++) {
+      // Number of copies to write: normally 1, but may be 0 or more than 1.
+      const int32 num_copies = GetCount(keep_proportion);
+      std::string key = example_reader.Key();
+      nnet2::DiscriminativeNnetExample eg = example_reader.Value();
+      if (max_length >= 0 && !PadDiscriminativeExamples(max_length, &eg))
+        continue;
+
+      NnetDiscriminativeExample nnet3_eg;
+      ConvertToNnet3(eg, fixed_vector_dim, &nnet3_eg);
+
+      // The time shift and weight truncation are applied once, and only when
+      // at least one copy is going to be written.
+      if (num_copies > 0) {
+        if (frame_shift != 0)
+          ShiftDiscriminativeExampleTimes(frame_shift, exclude_names, &nnet3_eg);
+        if (truncate_deriv_weights != 0)
+          TruncateDerivWeights(truncate_deriv_weights, &nnet3_eg);
+      }
+      for (int32 c = 0; c < num_copies; c++) {
+        int32 index = (random ? Rand() : num_written) % num_outputs;
+        example_writers[index]->Write(key, nnet3_eg);
+        num_written++;
+      }
+    }
+
+    // Destroy the writers before reporting.
+    for (int32 n = 0; n < num_outputs; n++)
+      delete example_writers[n];
+    KALDI_LOG << "Read " << num_read
+              << " neural-network training examples, wrote " << num_written;
+    // Exit status is non-zero if nothing was written.
+    return (num_written == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-copy-egs-to-nnet2.cc b/src/nnet3bin/nnet3-discriminative-copy-egs-to-nnet2.cc
new file mode 100644
index 00000000000..a1def464a90
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-copy-egs-to-nnet2.cc
@@ -0,0 +1,186 @@
+// nnet3bin/nnet3-discriminative-copy-egs-to-nnet2.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet2/nnet-example.h"
+
+namespace kaldi {
+// Stochastically rounds "expected_count" (which must be >= 0) to an integer:
+// the result is either floor(expected_count) or ceil(expected_count), drawn
+// so that its expected value is "expected_count".
+int32 GetCount(double expected_count) {
+  KALDI_ASSERT(expected_count >= 0.0);
+  const int32 whole_part = floor(expected_count);
+  const double fraction = expected_count - whole_part;
+  const bool round_up = WithProb(fraction);
+  return round_up ? whole_part + 1 : whole_part;
+}
+
+void AppendVectorToFeats(const MatrixBase<BaseFloat> &in,
+                         const VectorBase<BaseFloat> &vec,
+                         Matrix<BaseFloat> *out) {
+  // Builds *out = [ in | vec ]: "in" on the left, "vec" copied into each row.
+  const int32 num_rows = in.NumRows(), feat_dim = in.NumCols();
+  KALDI_ASSERT(num_rows != 0);
+  out->Resize(num_rows, feat_dim + vec.Dim());
+  out->Range(0, num_rows, 0, feat_dim).CopyFromMat(in);
+  out->Range(0, num_rows, feat_dim, vec.Dim()).CopyRowsFromVec(vec);
+}
+
+// Converts an nnet3 discriminative eg to nnet2, appending iVectors to feats.
+void ConvertToNnet2(const nnet3::NnetDiscriminativeExample &eg,
+                    nnet2::DiscriminativeNnetExample *nnet2_eg) {
+  if (eg.outputs.empty() || eg.inputs.size() < 2 || eg.inputs[0].indexes.empty())
+    KALDI_ERR << "Cannot convert eg: need an output, features and iVectors.";
+  nnet2_eg->num_ali = eg.outputs[0].supervision.num_ali;
+  ConvertLattice(eg.outputs[0].supervision.den_lat, &nnet2_eg->den_lat);
+  int32 feat_dim = eg.inputs[0].features.NumCols();
+  int32 ivector_dim = eg.inputs[1].features.NumCols();
+  int32 nrows = eg.inputs[0].features.NumRows();
+  if (eg.inputs[1].features.NumRows() == 1) {
+    // A single iVector row: repeat it on every frame of the features.
+    Matrix<BaseFloat> feats(nrows, feat_dim);
+    eg.inputs[0].features.CopyToMat(&feats, kNoTrans);
+    Matrix<BaseFloat> ivector(1, ivector_dim);
+    eg.inputs[1].features.CopyToMat(&ivector, kNoTrans);
+    AppendVectorToFeats(feats, ivector.Row(0), &(nnet2_eg->input_frames));
+  } else {
+    // Otherwise copy features and iVectors into adjacent column ranges.
+    nnet2_eg->input_frames.Resize(nrows, feat_dim + ivector_dim);
+    SubMatrix<BaseFloat> feats(nnet2_eg->input_frames, 0, nrows, 0, feat_dim);
+    eg.inputs[0].features.CopyToMat(&feats, kNoTrans);
+    SubMatrix<BaseFloat> ivector(nnet2_eg->input_frames, 0, nrows, feat_dim, ivector_dim);
+    eg.inputs[1].features.CopyToMat(&ivector, kNoTrans);
+  }
+  nnet2_eg->left_context = -eg.inputs[0].indexes[0].t;
+  nnet2_eg->weight = eg.outputs[0].supervision.weight;
+  nnet2_eg->Check();
+}
+
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Convert examples for nnet3 discriminative training into nnet2\n"
+        "discriminative training examples, possibly changing the binary mode.\n"
+        "\n"
+        "Usage:  nnet3-discriminative-copy-egs-to-nnet2 [options] <egs-rspecifier> <egs-wspecifier1> [<egs-wspecifier2> ...]\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-discriminative-copy-egs-to-nnet2 ark:train.degs ark,t:text.degs\n"
+        "or:\n"
+        "nnet3-discriminative-copy-egs-to-nnet2 ark:train.degs ark:1.degs ark:2.degs\n";
+
+    bool random = false;
+    int32 srand_seed = 0;
+    int32 frame_shift = 0;
+    int32 truncate_deriv_weights = 0;
+    BaseFloat keep_proportion = 1.0;
+
+    ParseOptions po(usage);
+    po.Register("random", &random, "If true, will write frames to output "
+                "archives randomly, not round-robin.");
+    po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will "
+                "randomly keep this proportion of the input samples.  If >1.0, it will "
+                "in expectation copy a sample this many times.  It will copy it a number "
+                "of times equal to floor(keep-proportion) or ceil(keep-proportion).");
+    po.Register("srand", &srand_seed, "Seed for random number generator "
+                "(only relevant if --random=true or --keep-proportion != 1.0)");
+    po.Register("frame-shift", &frame_shift, "Allows you to shift time values "
+                "in the supervision data (excluding iVector data) - useful in "
+                "augmenting data.  Note, the outputs will remain at the closest "
+                "exact multiples of the frame subsampling factor");
+    po.Register("truncate-deriv-weights", &truncate_deriv_weights,
+                "If nonzero, the number of initial/final subsample frames that "
+                "will have their derivatives' weights set to zero.");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() < 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1);
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+
+    // One nnet2 writer per output wspecifier (arguments 2, 3, ...).
+    int32 num_outputs = po.NumArgs() - 1;
+    std::vector<nnet2::DiscriminativeNnetExampleWriter*> example_writers(num_outputs);
+    for (int32 i = 0; i < num_outputs; i++)
+      example_writers[i] = new nnet2::DiscriminativeNnetExampleWriter(po.GetArg(i+2));
+
+    std::vector<std::string> exclude_names; // names we never shift times of;
+                                            // not configurable for now.
+    exclude_names.push_back(std::string("ivector"));
+
+    // Write each eg 'count' times, converting it to nnet2 format on output.
+    int64 num_read = 0, num_written = 0;
+    for (; !example_reader.Done(); example_reader.Next(), num_read++) {
+      // count is normally 1; could be 0, or possibly >1.
+      int32 count = GetCount(keep_proportion);
+      std::string key = example_reader.Key();
+      if (frame_shift == 0 && truncate_deriv_weights == 0) {
+        const NnetDiscriminativeExample &eg = example_reader.Value();
+        for (int32 c = 0; c < count; c++) {
+          int32 index = (random ? Rand() : num_written) % num_outputs;
+          nnet2::DiscriminativeNnetExample nnet2_eg;
+          ConvertToNnet2(eg, &nnet2_eg);
+          example_writers[index]->Write(key, nnet2_eg);
+          num_written++;
+        }
+      } else if (count > 0) {
+        NnetDiscriminativeExample eg = example_reader.Value();  // modifiable copy
+        if (frame_shift != 0)
+          ShiftDiscriminativeExampleTimes(frame_shift, exclude_names, &eg);
+        if (truncate_deriv_weights != 0)
+          TruncateDerivWeights(truncate_deriv_weights, &eg);
+        for (int32 c = 0; c < count; c++) {
+          int32 index = (random ? Rand() : num_written) % num_outputs;
+          nnet2::DiscriminativeNnetExample nnet2_eg;
+          ConvertToNnet2(eg, &nnet2_eg);
+          example_writers[index]->Write(key, nnet2_eg);
+          num_written++;
+        }
+      }
+    }
+    for (int32 i = 0; i < num_outputs; i++)
+      delete example_writers[i];
+    KALDI_LOG << "Read " << num_read
+              << " neural-network training examples, wrote " << num_written;
+    return (num_written == 0 ? 1 : 0);  // nonzero exit if nothing was written
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-copy-egs.cc b/src/nnet3bin/nnet3-discriminative-copy-egs.cc
new file mode 100644
index 00000000000..831484ebb11
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-copy-egs.cc
@@ -0,0 +1,139 @@
+// nnet3bin/nnet3-discriminative-copy-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-discriminative-example.h"
+
+namespace kaldi {
+// Stochastically rounds "expected_count" (which must be >= 0) to an integer:
+// the result is either floor(expected_count) or ceil(expected_count), drawn
+// so that its expected value is "expected_count".
+int32 GetCount(double expected_count) {
+  KALDI_ASSERT(expected_count >= 0.0);
+  const int32 whole_part = floor(expected_count);
+  const double fraction = expected_count - whole_part;
+  const bool round_up = WithProb(fraction);
+  return round_up ? whole_part + 1 : whole_part;
+}
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Copy examples for nnet3 discriminative training, possibly changing the binary mode.\n"
+        "Supports multiple wspecifiers, in which case it will write the examples\n"
+        "round-robin to the outputs.\n"
+        "\n"
+        "Usage:  nnet3-discriminative-copy-egs [options] <egs-rspecifier> <egs-wspecifier1> [<egs-wspecifier2> ...]\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-discriminative-copy-egs ark:train.degs ark,t:text.degs\n"
+        "or:\n"
+        "nnet3-discriminative-copy-egs ark:train.degs ark:1.degs ark:2.degs\n";
+
+    // Option values (defaults: keep-proportion 1, no shift, no truncation).
+    bool random = false;
+    int32 srand_seed = 0, frame_shift = 0, truncate_deriv_weights = 0;
+    BaseFloat keep_proportion = 1.0;
+
+    ParseOptions po(usage);
+    po.Register("random", &random, "If true, will write frames to output "
+                "archives randomly, not round-robin.");
+    po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will "
+                "randomly keep this proportion of the input samples.  If >1.0, it will "
+                "in expectation copy a sample this many times.  It will copy it a number "
+                "of times equal to floor(keep-proportion) or ceil(keep-proportion).");
+    po.Register("srand", &srand_seed, "Seed for random number generator "
+                "(only relevant if --random=true or --keep-proportion != 1.0)");
+    po.Register("frame-shift", &frame_shift, "Allows you to shift time values "
+                "in the supervision data (excluding iVector data) - useful in "
+                "augmenting data.  Note, the outputs will remain at the closest "
+                "exact multiples of the frame subsampling factor");
+    po.Register("truncate-deriv-weights", &truncate_deriv_weights,
+                "If nonzero, the number of initial/final subsample frames that "
+                "will have their derivatives' weights set to zero.");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    const int32 num_args = po.NumArgs();
+    if (num_args < 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    // Argument 1 is the input; each further argument is one output archive.
+    SequentialNnetDiscriminativeExampleReader example_reader(po.GetArg(1));
+    const int32 num_outputs = num_args - 1;
+    std::vector<NnetDiscriminativeExampleWriter*> example_writers;
+    for (int32 arg = 2; arg <= num_args; arg++)
+      example_writers.push_back(
+          new NnetDiscriminativeExampleWriter(po.GetArg(arg)));
+
+    // Inputs with these names never have their times shifted (not
+    // configurable for now).
+    std::vector<std::string> exclude_names(1, std::string("ivector"));
+
+    const bool modify_egs = (frame_shift != 0 || truncate_deriv_weights != 0);
+    int64 num_read = 0, num_written = 0;
+    while (!example_reader.Done()) {
+      // Usually 1; may be 0 (eg dropped) or more than 1 (eg duplicated).
+      const int32 num_copies = GetCount(keep_proportion);
+      const std::string key = example_reader.Key();
+      NnetDiscriminativeExample modified_eg;
+      const NnetDiscriminativeExample *eg = NULL;  // NULL: nothing to write
+      if (!modify_egs) {
+        eg = &example_reader.Value();  // write the reader's eg as it is
+      } else if (num_copies > 0) {
+        modified_eg = example_reader.Value();
+        if (frame_shift != 0)
+          ShiftDiscriminativeExampleTimes(frame_shift, exclude_names,
+                                          &modified_eg);
+        if (truncate_deriv_weights != 0)
+          TruncateDerivWeights(truncate_deriv_weights, &modified_eg);
+        eg = &modified_eg;
+      }
+      for (int32 c = 0; eg != NULL && c < num_copies; c++, num_written++) {
+        const int32 index = (random ? Rand() : num_written) % num_outputs;
+        example_writers[index]->Write(key, *eg);
+      }
+      example_reader.Next();
+      num_read++;
+    }
+    // Free the writers before reporting the totals.
+    for (size_t i = 0; i < example_writers.size(); i++)
+      delete example_writers[i];
+    KALDI_LOG << "Read " << num_read
+              << " neural-network training examples, wrote " << num_written;
+    return (num_written > 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/nnet3bin/nnet3-discriminative-get-egs.cc b/src/nnet3bin/nnet3-discriminative-get-egs.cc
new file mode 100644
index 00000000000..8dc7090600f
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-get-egs.cc
@@ -0,0 +1,357 @@
+// nnet3bin/nnet3-discriminative-get-egs.cc
+
+// Copyright      2015  Johns Hopkins University (author:  Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "hmm/posterior.h"
+#include "nnet3/nnet-discriminative-example.h"
+#include "nnet3/discriminative-supervision.h"
+#include "nnet3/nnet-example-utils.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+/**
+   This function does all the processing for one utterance, and outputs the
+   resulting examples to 'example_writer'.  If frames_per_eg is -1 the whole
+   utterance becomes one example; otherwise it is split into ranges of
+   frames_per_eg input frames (which must be a multiple of
+   frame_subsampling_factor).  *num_frames_written and *num_egs_written are
+   incremented for each example written.  Returns false, after a warning, if
+   the utterance is too short to give any example, and true otherwise; a
+   mismatch between the supervision and feature lengths is a fatal error.
+*/
+static bool ProcessFile(
+                        const discriminative::SplitDiscriminativeSupervisionOptions &config,
+                        const TransitionModel &tmodel,
+                        const MatrixBase<BaseFloat> &feats,
+                        const MatrixBase<BaseFloat> *ivector_feats,
+                        const discriminative::DiscriminativeSupervision &supervision,
+                        const std::string &utt_id,
+                        bool compress,
+                        int32 left_context,
+                        int32 right_context,
+                        int32 frames_per_eg,
+                        int32 frames_overlap_per_eg,
+                        int32 frame_subsampling_factor,
+                        int64 *num_frames_written,
+                        int64 *num_egs_written,
+                        NnetDiscriminativeExampleWriter *example_writer) {
+  KALDI_ASSERT(supervision.num_sequences == 1);
+  int32 num_feature_frames = feats.NumRows(),
+      num_output_frames = supervision.frames_per_sequence,
+      num_feature_frames_subsampled =
+                             (num_feature_frames + frame_subsampling_factor - 1)/
+                             frame_subsampling_factor;
+  if (num_output_frames != num_feature_frames_subsampled)
+    KALDI_ERR << "Mismatch in num-frames: discriminative supervision has "
+              << num_output_frames
+              << " versus features/frame_subsampling_factor = "
+              << num_feature_frames << " / " << frame_subsampling_factor
+              << ": check that --frame-subsampling-factor option is set "
+              << "the same as to discriminative-get-supervision.";
+
+  // -1 means "whole utterance", so only a real chunk size is checked here.
+  KALDI_ASSERT(frames_per_eg == -1 ||
+               frames_per_eg % frame_subsampling_factor == 0);
+
+  int32 frames_per_eg_subsampled = frames_per_eg / frame_subsampling_factor,
+      frames_overlap_subsampled = frames_overlap_per_eg / frame_subsampling_factor,
+      frames_shift_subsampled = frames_per_eg_subsampled - frames_overlap_subsampled;
+
+  if (frames_per_eg != -1 && num_feature_frames_subsampled < frames_per_eg_subsampled) {
+    KALDI_WARN << "No output for utterance " << utt_id
+               << " (num-frames=" << num_feature_frames
+               << ") because too short for --frames-per-eg="
+               << frames_per_eg;
+    return false;
+  }
+
+  // we don't do any padding, as it would be a bit tricky to pad the discriminative training supervision.
+  // Instead we select ranges of frames that fully fit within the file;  these
+  // might slightly overlap with each other or have gaps.
+  std::vector<int32> range_starts_subsampled;
+  if (frames_per_eg != -1) {
+    SplitIntoRanges(num_feature_frames_subsampled -
+                           frames_overlap_subsampled,
+                           frames_shift_subsampled,
+                           &range_starts_subsampled);
+  } else {
+    range_starts_subsampled.push_back(0);
+  }
+  // The 'deriv_weights' make sure we don't count frames twice, and also ensure
+  // that we tend to avoid having nonzero weights on the derivatives that are
+  // too close to the edge of the corresponding 'range' (these derivatives close
+  // to the edge are not as accurate as they could be, because when we split we
+  // don't know the correct alphas and betas).
+  std::vector<Vector<BaseFloat> > deriv_weights;
+  if (frames_per_eg != -1) {
+    GetWeightsForRanges(frames_per_eg_subsampled,
+                        range_starts_subsampled,
+                        &deriv_weights);
+    if (range_starts_subsampled.empty()) {
+      KALDI_WARN << "No output for utterance " << utt_id
+                 << " (num-frames=" << num_feature_frames
+                 << ") because too short for --frames-per-eg="
+                 << frames_per_eg;
+      return false;
+    }
+  } else {
+    deriv_weights.push_back(Vector<BaseFloat>());
+  }
+
+  discriminative::DiscriminativeSupervisionSplitter splitter(config, tmodel,
+                                                             supervision);
+
+  for (size_t i = 0; i < range_starts_subsampled.size(); i++) {
+    NnetDiscriminativeExample nnet_discriminative_eg;
+    nnet_discriminative_eg.outputs.resize(1);
+    int32 range_start_subsampled = range_starts_subsampled[i],
+        range_start = range_start_subsampled * frame_subsampling_factor;
+    if (frames_per_eg != -1) {
+      discriminative::DiscriminativeSupervision supervision_part;
+      splitter.GetFrameRange(range_start_subsampled,
+                             frames_per_eg_subsampled,
+                             (i == 0 ? false : true),
+                             &supervision_part);
+      int32 first_frame = 0;  // we shift the time-indexes of all these parts so
+                              // that the supervised part starts from frame 0.
+      NnetDiscriminativeSupervision nnet_supervision("output", supervision_part,
+                                                     deriv_weights[i],
+                                                     first_frame,
+                                                     frame_subsampling_factor);
+      nnet_discriminative_eg.outputs[0].Swap(&nnet_supervision);
+    } else {
+      int32 first_frame = 0;  // we shift the time-indexes of all these parts so
+                              // that the supervised part starts from frame 0.
+      NnetDiscriminativeSupervision nnet_supervision("output", supervision,
+                                                     deriv_weights[i],
+                                                     first_frame,
+                                                     frame_subsampling_factor);
+      nnet_discriminative_eg.outputs[0].Swap(&nnet_supervision);
+    }
+
+    nnet_discriminative_eg.inputs.resize(ivector_feats != NULL ? 2 : 1);
+    // Number of *input* frames covered by this eg.  num_output_frames counts
+    // subsampled (output) frames, so convert it back to input-frame units.
+    int32 this_frames_per_eg = frames_per_eg != -1 ? frames_per_eg :
+        num_output_frames * frame_subsampling_factor;
+    int32 tot_frames = left_context + this_frames_per_eg + right_context;
+    Matrix<BaseFloat> input_frames(tot_frames, feats.NumCols(), kUndefined);
+
+    // Set up "input_frames".
+    for (int32 j = -left_context; j < this_frames_per_eg + right_context; j++) {
+      int32 t = range_start + j;
+      if (t < 0) t = 0;
+      if (t >= feats.NumRows()) t = feats.NumRows() - 1;
+      SubVector<BaseFloat> src(feats, t),
+          dest(input_frames, j + left_context);
+      dest.CopyFromVec(src);
+    }
+    NnetIo input_io("input", - left_context,
+                    input_frames);
+    nnet_discriminative_eg.inputs[0].Swap(&input_io);
+
+    if (ivector_feats != NULL) {
+      // if applicable, add the iVector feature.
+      // try to get closest frame to middle of window to get
+      // a representative iVector.
+      int32 closest_frame = range_start + this_frames_per_eg / 2;
+      KALDI_ASSERT(ivector_feats->NumRows() > 0);
+      if (closest_frame >= ivector_feats->NumRows())
+        closest_frame = ivector_feats->NumRows() - 1;
+      Matrix<BaseFloat> ivector(1, ivector_feats->NumCols());
+      ivector.Row(0).CopyFromVec(ivector_feats->Row(closest_frame));
+      NnetIo ivector_io("ivector", 0, ivector);
+      nnet_discriminative_eg.inputs[1].Swap(&ivector_io);
+    }
+
+    if (compress)
+      nnet_discriminative_eg.Compress();
+
+    std::ostringstream os;
+    os << utt_id << "-" << range_start;
+    std::string key = os.str(); // key is <utt_id>-<frame_id>
+    *num_frames_written += this_frames_per_eg;
+    *num_egs_written += 1;
+
+    example_writer->Write(key, nnet_discriminative_eg);
+  }
+  return true;
+}
+
+
+} // namespace nnet3
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Get frame-by-frame examples of data for nnet3+sequence neural network\n"
+        "training.  This involves breaking up utterances into pieces of a\n"
+        "fixed size.  Input will come from discriminative-get-supervision.\n"
+        "\n"
+        "Usage:  nnet3-discriminative-get-egs [options] <model> <features-rspecifier> "
+        "<discriminative-supervision-rspecifier> <egs-wspecifier>\n"
+        "\n"
+        "An example [where $feats expands to the actual features]:\n"
+        "discriminative-get-supervision [args] | \\\n"
+        "  nnet3-discriminative-get-egs --left-context=25 --right-context=9 --num-frames=20 \\\n"
+        "  final.mdl \"$feats\" ark,s,cs:- ark:degs.1.ark\n"
+        "Note: the --frame-subsampling-factor option must be the same as given to\n"
+        "discriminative-get-supervision.\n";
+
+    bool compress = true;
+    int32 left_context = 0, right_context = 0, num_frames = 1,
+        num_frames_overlap = 0, length_tolerance = 100,
+        frame_subsampling_factor = 1;
+    std::string ivector_rspecifier;
+    discriminative::SplitDiscriminativeSupervisionOptions splitter_config;
+
+    ParseOptions po(usage);
+    po.Register("compress", &compress, "If true, write egs in "
+                "compressed format (recommended)");
+    po.Register("left-context", &left_context, "Number of frames of left "
+                "context the neural net requires.");
+    po.Register("right-context", &right_context, "Number of frames of right "
+                "context the neural net requires.");
+    po.Register("num-frames", &num_frames, "Number of frames with labels "
+                "that each example contains.  Will be rounded up to a multiple "
+                "of --frame-subsampling-factor.");
+    po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of "
+                "overlap between each example (could be useful in conjunction "
+                "with --min-deriv-time and --max-deriv-time, to avoid wasting data). "
+                "Each time we shift by --num-frames minus --num-frames-overlap.");
+    po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector "
+                "features, as a matrix.");
+    po.Register("length-tolerance", &length_tolerance, "Tolerance for "
+                "difference in num-frames between feat and ivector matrices");
+    po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used "
+                "if the frame-rate at the output will be less than the "
+                "frame-rate of the input");
+
+    ParseOptions splitter_opts("supervision-splitter", &po);
+    splitter_config.Register(&splitter_opts);
+    po.Read(argc, argv);
+    if (po.NumArgs() != 4) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    if (left_context < 0 || right_context < 0 ||
+        length_tolerance < 0 || frame_subsampling_factor <= 0)
+      KALDI_ERR << "One of the integer options is out of the allowed range.";
+
+    if (frame_subsampling_factor != 1)
+      RoundUpNumFrames(frame_subsampling_factor,
+                       &num_frames, &num_frames_overlap);
+
+    // Validate after any rounding; -1 selects whole-utterance examples.
+    if (num_frames_overlap < 0 ||
+        (num_frames != -1 &&
+         (num_frames <= 0 || num_frames_overlap >= num_frames)))
+      KALDI_ERR << "--num-frames must be positive (or -1), and "
+                << "--num-frames-overlap non-negative and less than it.";
+
+    std::string model_wxfilename = po.GetArg(1),
+        feature_rspecifier = po.GetArg(2),
+        supervision_rspecifier = po.GetArg(3),
+        examples_wspecifier = po.GetArg(4);
+
+    TransitionModel tmodel;
+    {
+      bool binary;
+      Input ki(model_wxfilename, &binary);
+      tmodel.Read(ki.Stream(), binary);
+    }
+
+    SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier);
+    discriminative::RandomAccessDiscriminativeSupervisionReader supervision_reader(
+        supervision_rspecifier);
+    NnetDiscriminativeExampleWriter example_writer(examples_wspecifier);
+    RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier);
+
+    int32 num_done = 0, num_err = 0;
+    int64 num_frames_written = 0, num_egs_written = 0;
+
+    for (; !feat_reader.Done(); feat_reader.Next()) {
+      std::string key = feat_reader.Key();
+      const Matrix<BaseFloat> &feats = feat_reader.Value();
+      if (!supervision_reader.HasKey(key)) {
+        KALDI_WARN << "No supervision for key " << key;
+        num_err++;
+      } else {
+        const discriminative::DiscriminativeSupervision &supervision = supervision_reader.Value(key);
+        const Matrix<BaseFloat> *ivector_feats = NULL;
+        if (!ivector_rspecifier.empty()) {
+          if (!ivector_reader.HasKey(key)) {
+            KALDI_WARN << "No iVectors for utterance " << key;
+            num_err++;
+            continue;
+          } else {
+            // this address will be valid until we call HasKey() or Value()
+            // again.
+            ivector_feats = &(ivector_reader.Value(key));
+          }
+        }
+        if (ivector_feats != NULL &&
+            (std::abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance
+             || ivector_feats->NumRows() == 0)) {
+          KALDI_WARN << "Length difference between feats " << feats.NumRows()
+                     << " and iVectors " << ivector_feats->NumRows()
+                     << " exceeds tolerance " << length_tolerance;
+          num_err++;
+          continue;
+        }
+        if (ProcessFile(splitter_config, tmodel,
+                        feats, ivector_feats, supervision,
+                        key, compress, left_context, right_context, num_frames,
+                        num_frames_overlap, frame_subsampling_factor,
+                        &num_frames_written, &num_egs_written,
+                        &example_writer))
+          num_done++;
+        else {
+          KALDI_WARN << "Failed to process utterance into nnet example "
+                     << "for key " << key;
+          num_err++;
+        }
+      }
+    }
+
+    KALDI_LOG << "Finished generating nnet3-discriminative examples, "
+              << "successfully processed " << num_done
+              << " feature files, wrote " << num_egs_written << " examples, "
+              << "with " << num_frames_written << " frames in total; "
+              << num_err << " files had errors.";
+    return (num_egs_written == 0 || num_err > num_done ? 1 : 0);  // 1 = failure
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/nnet3bin/nnet3-discriminative-merge-egs.cc b/src/nnet3bin/nnet3-discriminative-merge-egs.cc
new file mode 100644
index 00000000000..5c386bd40b3
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-merge-egs.cc
@@ -0,0 +1,102 @@
+// nnet3bin/nnet3-discriminative-merge-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-discriminative-example.h"
+
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "This copies nnet3 discriminative training examples from input to output, merging them\n"
+        "into composite examples.  The --minibatch-size option controls how many egs\n"
+        "are merged into a single output eg.\n"
+        "\n"
+        "Usage:  nnet3-discriminative-merge-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
+        "e.g.\n"
+        "nnet3-discriminative-merge-egs --minibatch-size=128 ark:1.degs ark:- | nnet3-discriminative-train ... \n"
+        "See also nnet3-discriminative-copy-egs\n";
+
+    bool compress = false;
+    int32 minibatch_size = 64;
+
+    ParseOptions po(usage);
+    po.Register("minibatch-size", &minibatch_size, "Target size of minibatches "
+                "when merging.");
+    po.Register("compress", &compress, "If true, compress the output examples "
+                "(not recommended unless you are writing to disk)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+    if (minibatch_size <= 0)
+      KALDI_ERR << "--minibatch-size must be positive, got " << minibatch_size;
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+    NnetDiscriminativeExampleWriter example_writer(examples_wspecifier);
+
+    // Egs accumulated so far for the minibatch currently being assembled.
+    std::vector<NnetDiscriminativeExample> examples;
+    examples.reserve(minibatch_size);
+
+    int64 num_read = 0, num_written = 0;
+    while (!example_reader.Done()) {
+      examples.push_back(example_reader.Value());
+      bool minibatch_ready =
+          static_cast<int32>(examples.size()) >= minibatch_size;
+      // Do Next() now, so we can test example_reader.Done() below .
+      example_reader.Next();
+      num_read++;
+
+      // Write out a full minibatch, or the last partial one at end of input.
+      if (minibatch_ready || (example_reader.Done() && !examples.empty())) {
+        NnetDiscriminativeExample merged_eg;
+        MergeDiscriminativeExamples(compress, &examples, &merged_eg);
+        std::ostringstream ostr;
+        ostr << "merged-" << num_written;
+        num_written++;
+        std::string output_key = ostr.str();
+        example_writer.Write(output_key, merged_eg);
+        examples.clear();
+      }
+    }
+    KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.';
+    return (num_written != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc b/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc
new file mode 100644
index 00000000000..2a029123852
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-shuffle-egs.cc
@@ -0,0 +1,116 @@
+// nnet3bin/nnet3-discriminative-shuffle-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//           2014-2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-discriminative-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    // Copies examples from argument 1 to argument 2 in a randomized order.
+    const char *usage =
+        "Copy nnet3 discriminative training examples from the input to output,\n"
+        "while randomly shuffling the order.  This program will keep all of the examples\n"
+        "in memory at once, unless you use the --buffer-size option\n"
+        "\n"
+        "Usage:  nnet3-discriminative-shuffle-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
+        "\n"
+        "nnet3-discriminative-shuffle-egs --srand=1 ark:train.egs ark:shuffled.egs\n";
+
+    int32 srand_seed = 0;
+    int32 buffer_size = 0;
+    ParseOptions po(usage);
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("buffer-size", &buffer_size, "If >0, size of a buffer we use "
+                "to do limited-memory partial randomization.  Otherwise, do "
+                "full randomization.");
+
+    po.Read(argc, argv);
+    // Seed the C library generator from --srand before any shuffling is done.
+    srand(srand_seed);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+    // Counts the examples written to the output.
+    int64 num_done = 0;
+    // (key, heap-allocated example) pairs; a NULL pointer marks an empty slot.
+    std::vector<std::pair<std::string, NnetDiscriminativeExample*> > egs;
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+    NnetDiscriminativeExampleWriter example_writer(examples_wspecifier);
+    if (buffer_size == 0) { // Do full randomization
+      // Putting in an extra level of indirection here to avoid excessive
+      // computation and memory demands when we have to resize the vector.
+      // Everything is read first, shuffled, and written by the loop further down.
+      for (; !example_reader.Done(); example_reader.Next())
+        egs.push_back(std::pair<std::string, NnetDiscriminativeExample*>(
+            example_reader.Key(),
+            new NnetDiscriminativeExample(example_reader.Value())));
+
+      std::random_shuffle(egs.begin(), egs.end());
+    } else {  // partial randomization through a fixed-size buffer
+      KALDI_ASSERT(buffer_size > 0);
+      egs.resize(buffer_size,
+                 std::pair<std::string, NnetDiscriminativeExample*>("", NULL));
+      for (; !example_reader.Done(); example_reader.Next()) {
+        int32 index = RandInt(0, buffer_size - 1);
+        if (egs[index].second == NULL) {  // empty slot: just store the example
+          egs[index] = std::pair<std::string, NnetDiscriminativeExample*>(
+              example_reader.Key(),
+              new NnetDiscriminativeExample(example_reader.Value()));
+        } else {  // occupied slot: write out the occupant, then overwrite it
+          example_writer.Write(egs[index].first, *(egs[index].second));
+          egs[index].first = example_reader.Key();
+          *(egs[index].second) = example_reader.Value();
+          num_done++;
+        }
+      }
+    }
+    for (size_t i = 0; i < egs.size(); i++) {  // write and free what is still held
+      if (egs[i].second != NULL) {
+        example_writer.Write(egs[i].first, *(egs[i].second));
+        delete egs[i].second;
+        num_done++;
+      }
+    }
+
+    KALDI_LOG << "Shuffled order of " << num_done
+              << " neural-network training examples "
+              << (buffer_size ? "using a buffer (partial randomization)" : "");
+    // Exit status is 1 if nothing was written, 0 otherwise.
+    return (num_done == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-subset-egs.cc b/src/nnet3bin/nnet3-discriminative-subset-egs.cc
new file mode 100644
index 00000000000..0cbeb8d0ead
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-subset-egs.cc
@@ -0,0 +1,102 @@
+// nnet3bin/nnet3-discriminative-subset-egs.cc
+
+// Copyright 2012-2015  Johns Hopkins University (author:  Daniel Povey)
+//                2014  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-discriminative-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    // Keeps a random subset of at most --n examples, reading the input once.
+    const char *usage =
+        "Creates a random subset of the input examples, of a specified size.\n"
+        "Uses no more memory than the size of the subset.\n"
+        "\n"
+        "Usage:  nnet3-discriminative-subset-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-discriminative-subset-egs --n=1000 ark:train.degs ark:subset.degs\n";
+
+    int32 srand_seed = 0;
+    int32 n = 1000;
+    bool randomize_order = true;
+    ParseOptions po(usage);
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("n", &n, "Number of examples to output");
+    po.Register("randomize-order", &randomize_order, "If true, randomize the order "
+                "of the output");
+
+    po.Read(argc, argv);
+    // Seed the C library generator from --srand before any random choice.
+    srand(srand_seed);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+    // The reservoir: (key, example) pairs currently selected for output.
+    std::vector<std::pair<std::string, NnetDiscriminativeExample> > egs;
+    egs.reserve(n);
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+    // The first n examples fill 'egs'; each later one may replace a random entry.
+    int64 num_read = 0;
+    for (; !example_reader.Done(); example_reader.Next()) {
+      num_read++;
+      if (num_read <= n) {
+        egs.resize(egs.size() + 1);
+        egs.back().first = example_reader.Key();
+        egs.back().second = example_reader.Value();
+      } else {
+        BaseFloat keep_prob = n / static_cast<BaseFloat>(num_read);
+        if (WithProb(keep_prob)) { // With probability "keep_prob"
+          int32 index = RandInt(0, n-1);  // victim slot in the reservoir
+          egs[index].first = example_reader.Key();
+          egs[index].second = example_reader.Value();
+        }
+      }
+    }
+    if (randomize_order)
+      std::random_shuffle(egs.begin(), egs.end());
+    // The output is only opened once the whole input has been consumed.
+    NnetDiscriminativeExampleWriter writer(examples_wspecifier);
+    for (size_t i = 0; i < egs.size(); i++) {
+      writer.Write(egs[i].first, egs[i].second);
+    }
+
+    KALDI_LOG << "Selected a subset of " << egs.size() << " out of " << num_read
+              << " neural-network training examples ";
+    // Exit status is 1 only if the input contained no examples.
+    return (num_read != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
+
diff --git a/src/nnet3bin/nnet3-discriminative-train.cc b/src/nnet3bin/nnet3-discriminative-train.cc
new file mode 100644
index 00000000000..6d32b13d53d
--- /dev/null
+++ b/src/nnet3bin/nnet3-discriminative-train.cc
@@ -0,0 +1,102 @@
+// nnet3bin/nnet3-discriminative-train.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-discriminative-training.h"
+#include "nnet3/am-nnet-simple.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    // Runs one pass of discriminative training and writes the updated raw nnet.
+    const char *usage =
+        "Train nnet3 neural network parameters with discriminative sequence objective \n"
+        "gradient descent.  Minibatches are to be created by nnet3-discriminative-merge-egs in\n"
+        "the input pipeline.  This training program is single-threaded (best to\n"
+        "use it with a GPU).\n"
+        "\n"
+        "Usage:  nnet3-discriminative-train [options] <nnet-in> <discriminative-training-examples-in> <raw-nnet-out>\n"
+        "\n"
+        "nnet3-discriminative-train 1.mdl 'ark:nnet3-discriminative-merge-egs ark:1.degs ark:-|' 2.raw\n";
+
+    bool binary_write = true;
+    std::string use_gpu = "yes";
+    NnetDiscriminativeTrainingOptions opts;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("use-gpu", &use_gpu,
+                "yes|no|optional|wait, only has effect if compiled with CUDA");
+
+    opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().SelectGpuId(use_gpu);
+#endif
+
+    std::string model_rxfilename = po.GetArg(1),
+        examples_rspecifier = po.GetArg(2),
+        model_wxfilename = po.GetArg(3);
+    // The input holds a transition model followed by an AmNnetSimple.
+    TransitionModel tmodel;
+    AmNnetSimple am_nnet;
+
+    bool binary;
+    Input ki(model_rxfilename, &binary);
+
+    tmodel.Read(ki.Stream(), binary);
+    am_nnet.Read(ki.Stream(), binary);
+    // Train a copy of the network; the priors are taken from the input model.
+    Nnet nnet = am_nnet.GetNnet();
+    const VectorBase<BaseFloat> &priors = am_nnet.Priors();
+
+    NnetDiscriminativeTrainer trainer(opts, tmodel, priors, &nnet);
+
+    SequentialNnetDiscriminativeExampleReader example_reader(examples_rspecifier);
+
+    for (; !example_reader.Done(); example_reader.Next())
+      trainer.Train(example_reader.Value());
+
+    bool ok = trainer.PrintTotalStats();
+
+#if HAVE_CUDA==1
+    CuDevice::Instantiate().PrintProfile();
+#endif
+    Output ko(model_wxfilename, binary_write);
+    nnet.Write(ko.Stream(), binary_write);
+    // Only the raw Nnet is written; exit status reflects PrintTotalStats().
+    KALDI_LOG << "Wrote raw nnet model to " << model_wxfilename;
+    return (ok ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/nnet3bin/nnet3-get-egs.cc b/src/nnet3bin/nnet3-get-egs.cc
index 75f264f1ceb..f6972351f83 100644
--- a/src/nnet3bin/nnet3-get-egs.cc
+++ b/src/nnet3bin/nnet3-get-egs.cc
@@ -192,13 +192,17 @@ int main(int argc, char *argv[]) {
         KALDI_WARN << "No pdf-level posterior for key " << key;
         num_err++;
       } else {
-        const Posterior &pdf_post = pdf_post_reader.Value(key);
-        if (pdf_post.size() != feats.NumRows()) {
+        Posterior pdf_post = pdf_post_reader.Value(key);
+        if (abs(static_cast<int32>(pdf_post.size()) - feats.NumRows()) > length_tolerance
+            || pdf_post.size() < feats.NumRows()) {
           KALDI_WARN << "Posterior has wrong size " << pdf_post.size()
                      << " versus " << feats.NumRows();
           num_err++;
           continue;
         }
+        while (static_cast<int32>(pdf_post.size()) > feats.NumRows()) {
+          pdf_post.pop_back();
+        }
         const Matrix<BaseFloat> *ivector_feats = NULL;
         if (!ivector_rspecifier.empty()) {
           if (!ivector_reader.HasKey(key)) {
diff --git a/src/nnet3bin/nnet3-modify-learning-rates.cc b/src/nnet3bin/nnet3-modify-learning-rates.cc
new file mode 100644
index 00000000000..89e14a5e819
--- /dev/null
+++ b/src/nnet3bin/nnet3-modify-learning-rates.cc
@@ -0,0 +1,186 @@
+// nnet3bin/nnet3-modify-learning-rates.cc
+
+// Copyright 2013  Guoguo Chen
+//           2015  Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-utils.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+    // Rebalances per-component learning rates from the prev -> cur change.
+    const char *usage =
+        "This program modifies the learning rates so as to equalize the\n"
+        "relative changes in parameters for each layer, while keeping their\n"
+        "geometric mean the same (or changing it to a value specified using\n"
+        "the --average-learning-rate option).\n"
+        "\n"
+        "Usage: nnet3-modify-learning-rates [options] <prev-model> \\\n"
+        "                                  <cur-model> <modified-cur-model>\n"
+        "e.g.: nnet3-modify-learning-rates --average-learning-rate=0.0002 \\\n"
+        "                                  5.raw 6.raw 6.raw\n";
+
+    bool binary_write = true;
+    bool retroactive = false;
+    BaseFloat average_learning_rate = 0.0;
+    BaseFloat first_layer_factor = 1.0;
+    BaseFloat last_layer_factor = 1.0;
+
+    ParseOptions po(usage);
+    po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("average-learning-rate", &average_learning_rate,
+                "If supplied, change learning rate geometric mean to the given "
+                "value.");
+    po.Register("first-layer-factor", &first_layer_factor, "Factor that "
+                "reduces the target relative learning rate for first layer.");
+    po.Register("last-layer-factor", &last_layer_factor, "Factor that "
+                "reduces the target relative learning rate for last layer.");
+    po.Register("retroactive", &retroactive, "If true, scale the parameter "
+                "differences as well.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    KALDI_ASSERT(average_learning_rate >= 0);
+
+    std::string prev_nnet_rxfilename = po.GetArg(1),
+        cur_nnet_rxfilename = po.GetArg(2),
+        modified_cur_nnet_wxfilename = po.GetArg(3);
+
+    // Both inputs are read as raw Nnet objects (no transition model).
+    Nnet prev_nnet, cur_nnet;
+    {
+      bool binary_read;
+      Input ki(prev_nnet_rxfilename, &binary_read);
+      prev_nnet.Read(ki.Stream(), binary_read);
+    }
+    {
+      bool binary_read;
+      Input ki(cur_nnet_rxfilename, &binary_read);
+      cur_nnet.Read(ki.Stream(), binary_read);
+    }
+
+    int32 ret = 0;
+
+    // Get info about magnitude of parameter change.
+    Nnet diff_nnet(prev_nnet);
+    AddNnet(cur_nnet, -1.0, &diff_nnet);
+    int32 num_updatable = NumUpdatableComponents(diff_nnet);
+    Vector<BaseFloat> dot_prod(num_updatable);
+    ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod);
+    dot_prod.ApplyPow(0.5); // take sqrt to get l2 norm of diff
+    KALDI_LOG << "Parameter differences per layer are "
+      << PrintVectorPerUpdatableComponent(prev_nnet, dot_prod);
+
+    Vector<BaseFloat> baseline_prod(num_updatable);
+    ComponentDotProducts(prev_nnet, prev_nnet, &baseline_prod);
+    baseline_prod.ApplyPow(0.5);
+    dot_prod.DivElements(baseline_prod);
+    KALDI_LOG << "Relative parameter differences per layer are "
+      << PrintVectorPerUpdatableComponent(prev_nnet, dot_prod);
+
+    // If the relative parameter difference for a certain layer is zero, set
+    // it to the mean of the remaining (nonzero) values.
+    int32 num_zero = 0;
+    for (int32 i = 0; i < num_updatable; i++) {
+      if (dot_prod(i) == 0.0) {
+        num_zero++;
+      }
+    }
+    KALDI_ASSERT(num_zero < num_updatable);  // else the mean below is 0/0
+    if (num_zero > 0) {
+      BaseFloat average_diff = dot_prod.Sum()
+        / static_cast<BaseFloat>(num_updatable - num_zero);
+      for (int32 i = 0; i < num_updatable; i++) {
+        if (dot_prod(i) == 0.0) {
+          dot_prod(i) = average_diff;
+        }
+      }
+      KALDI_LOG << "Zeros detected in the relative parameter difference "
+        << "vector, updating the vector to " << dot_prod ;
+    }
+
+    // Gets learning rates for the previous and current neural nets.
+    Vector<BaseFloat> prev_nnet_learning_rates(num_updatable),
+                      cur_nnet_learning_rates(num_updatable);
+    GetLearningRates(prev_nnet, &prev_nnet_learning_rates);
+    GetLearningRates(cur_nnet, &cur_nnet_learning_rates);
+    KALDI_LOG << "Learning rates for previous model per layer are "
+              << prev_nnet_learning_rates;
+    KALDI_LOG << "Learning rates for current model per layer are "
+              << cur_nnet_learning_rates;
+
+    // Gets target geometric mean.
+    BaseFloat target_geometric_mean = 0.0;
+    if (average_learning_rate == 0.0) {
+      target_geometric_mean = Exp(cur_nnet_learning_rates.SumLog()
+                                  / static_cast<BaseFloat>(num_updatable));
+    } else {
+      target_geometric_mean = average_learning_rate;
+    }
+    KALDI_ASSERT(target_geometric_mean > 0.0);
+
+    // Works out the new learning rates.  We start from the previous model;
+    // this ensures that if this program is run twice, we get consistent
+    // results even if it's overwritten the current model.
+    Vector<BaseFloat> nnet_learning_rates(prev_nnet_learning_rates);
+    nnet_learning_rates.DivElements(dot_prod);
+    KALDI_ASSERT(last_layer_factor > 0.0);
+    nnet_learning_rates(num_updatable - 1) *= last_layer_factor;
+    KALDI_ASSERT(first_layer_factor > 0.0);
+    nnet_learning_rates(0) *= first_layer_factor;
+    BaseFloat cur_geometric_mean = Exp(nnet_learning_rates.SumLog()
+                                 / static_cast<BaseFloat>(num_updatable));
+    nnet_learning_rates.Scale(target_geometric_mean / cur_geometric_mean);
+    KALDI_LOG << "New learning rates for current model per layer are "
+              << nnet_learning_rates;
+
+    // Changes the parameter differences if --retroactive is set to true.
+    if (retroactive) {
+      Vector<BaseFloat> scale_factors(nnet_learning_rates);
+      scale_factors.DivElements(prev_nnet_learning_rates);
+      AddNnet(prev_nnet, -1.0, &cur_nnet);
+      ScaleNnetComponents(scale_factors, &cur_nnet);
+      AddNnet(prev_nnet, 1.0, &cur_nnet);
+      KALDI_LOG << "Scale parameter difference retroactively. Scaling factors "
+                << "are " << scale_factors;
+    }
+
+    // Sets learning rates and writes updated model.
+    SetLearningRates(nnet_learning_rates, &cur_nnet);
+
+    Output ko(modified_cur_nnet_wxfilename, binary_write);
+    cur_nnet.Write(ko.Stream(), binary_write);
+
+    return ret;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/online2bin/ivector-extract-online2.cc b/src/online2bin/ivector-extract-online2.cc
index 3251d93b5dd..9d809aa69ae 100644
--- a/src/online2bin/ivector-extract-online2.cc
+++ b/src/online2bin/ivector-extract-online2.cc
@@ -55,6 +55,8 @@ int main(int argc, char *argv[]) {
 
     g_num_threads = 8;
     bool repeat = false;
+    int32 length_tolerance = 0;
+    std::string frame_weights_rspecifier;
     
     po.Register("num-threads", &g_num_threads,
                 "Number of threads to use for computing derived variables "
@@ -62,6 +64,12 @@ int main(int argc, char *argv[]) {
     po.Register("repeat", &repeat,
                 "If true, output the same number of iVectors as input frames "
                 "(including repeated data).");
+    po.Register("frame-weights-rspecifier", &frame_weights_rspecifier,
+                "Archive of frame weights to scale stats");
+    po.Register("length-tolerance", &length_tolerance,
+                "Tolerance on the difference in number of frames "
+                "for feats and weights");
+
     po.Read(argc, argv);
     
     if (po.NumArgs() != 3) {
@@ -82,9 +90,9 @@ int main(int argc, char *argv[]) {
     
     SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
     RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier);
+    RandomAccessBaseFloatVectorReader frame_weights_reader(frame_weights_rspecifier);
     BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier);
     
-    
     for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
       std::string spk = spk2utt_reader.Key();
       const std::vector<std::string> &uttlist = spk2utt_reader.Value();
@@ -105,6 +113,27 @@ int main(int argc, char *argv[]) {
                                              &matrix_feature);
         
         ivector_feature.SetAdaptationState(adaptation_state);
+         
+        if (!frame_weights_rspecifier.empty()) {
+          if (!frame_weights_reader.HasKey(utt)) {
+            KALDI_WARN << "Did not find weights for utterance " << utt;
+            num_err++;
+            continue;
+          }
+          const Vector<BaseFloat> &weights = frame_weights_reader.Value(utt);
+
+          if (std::abs(weights.Dim() - feats.NumRows()) > length_tolerance) {
+            num_err++;
+            continue;
+          }
+
+          std::vector<std::pair<int32, BaseFloat> > frame_weights;
+          for (int32 i = 0; i < std::min(weights.Dim(), feats.NumRows()); i++) {
+            frame_weights.push_back(std::make_pair(i, weights(i)));
+          }
+
+          ivector_feature.UpdateFrameWeights(frame_weights);
+        }
 
         int32 T = feats.NumRows(),
             n = (repeat ? 1 : ivector_config.ivector_period),
diff --git a/src/util/stl-utils.h b/src/util/stl-utils.h
index d37e4d2d203..9ffe86b51a3 100644
--- a/src/util/stl-utils.h
+++ b/src/util/stl-utils.h
@@ -292,6 +292,29 @@ struct CompareFirstMemberOfPair {
   }
 };
 
+/// Comparator that orders indexes by the values they select in a separate
+/// stl vector (ascending by default); that vector must outlive this object.
+template <typename T>
+class OtherStlVectorComparator {
+  public:
+    OtherStlVectorComparator(const std::vector<T> &vec,
+                             bool descending = false)
+      : vec_(vec), descending_(descending) { }
+
+    bool operator() (size_t a, size_t b) const {
+      if (descending_) return vec_[a] > vec_[b];
+      else return vec_[a] < vec_[b];
+    }
+
+    inline void SetDescending() { descending_ = true; }
+    inline void SetAscending() { descending_ = false; }
+
+  private:
+    const std::vector<T> &vec_;  // referenced, not copied or owned
+    bool descending_;
+};
+
+
 /// For a vector of pair<I, F> where I is an integer and F a floating-point or
 /// integer type, this function sorts a vector of type vector<pair<I, F> > on
 /// the I value and then merges elements with equal I values, summing these over
diff --git a/tools/extras/openfstwin-1.3.4.patch b/tools/extras/openfstwin-1.3.4.patch
index e142341f5ba..858a61160fa 100644
--- a/tools/extras/openfstwin-1.3.4.patch
+++ b/tools/extras/openfstwin-1.3.4.patch
@@ -1,425 +1,425 @@
-diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h
-index 5ad3b52..d9c0ca6 100644
---- a/src/include/fst/fst.h
-+++ b/src/include/fst/fst.h
-@@ -45,6 +45,12 @@ DECLARE_bool(fst_align);
- 
- namespace fst {
- 
-+	typedef ::int64 int64;
-+	typedef ::uint64 uint64;
-+	typedef ::int32 int32;
-+	typedef ::uint32 uint32;
-+
-+
- bool OPENFSTDLL IsFstHeader(istream &, const string &); //ChangedPD
- 
- class FstHeader;
-diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h
-index c4362f2..58cad44 100644
---- a/src/include/fst/interval-set.h
-+++ b/src/include/fst/interval-set.h
-@@ -37,38 +37,38 @@ template <typename T>
- class IntervalSet {
-  public:
-   struct Interval {
--    T begin;
--    T end;
-+    T begin_;
-+    T end_;
- 
--    Interval() : begin(-1), end(-1) {}
-+    Interval() : begin_(-1), end_(-1) {}
- 
--    Interval(T b, T e) : begin(b), end(e) {}
-+    Interval(T b, T e) : begin_(b), end_(e) {}
- 
-     bool operator<(const Interval &i) const {
--      return begin < i.begin || (begin == i.begin && end > i.end);
-+      return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_);
-     }
- 
-     bool operator==(const Interval &i) const {
--      return begin == i.begin && end == i.end;
-+      return begin_ == i.begin_ && end_ == i.end_;
-     }
- 
-     bool operator!=(const Interval &i) const {
--      return begin != i.begin || end != i.end;
-+      return begin_ != i.begin_ || end_ != i.end_;
-     }
- 
-     istream &Read(istream &strm) {
-       T n;
-       ReadType(strm, &n);
--      begin = n;
-+      begin_ = n;
-       ReadType(strm, &n);
--      end = n;
-+      end_ = n;
-       return strm;
-     }
- 
-     ostream &Write(ostream &strm) const {
--      T n = begin;
-+      T n = begin_;
-       WriteType(strm, n);
--      n = end;
-+      n = end_;
-       WriteType(strm, n);
-       return strm;
-     }
-@@ -108,7 +108,7 @@ class IntervalSet {
-         lower_bound(intervals_.begin(), intervals_.end(), interval);
-     if (lb == intervals_.begin())
-       return false;
--    return (--lb)->end > value;
-+    return (--lb)->end_ > value;
-   }
- 
-   // Requires intervals be normalized.
-@@ -123,7 +123,7 @@ class IntervalSet {
- 
-   bool Singleton() const {
-     return intervals_.size() == 1 &&
--        intervals_[0].begin + 1 == intervals_[0].end;
-+        intervals_[0].begin_ + 1 == intervals_[0].end_;
-   }
- 
- 
-@@ -178,17 +178,17 @@ void IntervalSet<T>::Normalize() {
-   T size = 0;
-   for (T i = 0; i < intervals_.size(); ++i) {
-     Interval &inti = intervals_[i];
--    if (inti.begin == inti.end)
-+    if (inti.begin_ == inti.end_)
-       continue;
-     for (T j = i + 1; j < intervals_.size(); ++j) {
-       Interval &intj = intervals_[j];
--      if (intj.begin > inti.end)
-+      if (intj.begin_ > inti.end_)
-         break;
--      if (intj.end > inti.end)
--        inti.end = intj.end;
-+      if (intj.end_ > inti.end_)
-+        inti.end_ = intj.end_;
-       ++i;
-     }
--    count_ += inti.end - inti.begin;
-+    count_ += inti.end_ - inti.begin_;
-     intervals_[size++] = inti;
-   }
-   intervals_.resize(size);
-@@ -208,17 +208,17 @@ void IntervalSet<T>::Intersect(const IntervalSet<T> &iset,
-   oset->count_ = 0;
- 
-   while (it1 != intervals_.end() && it2 != iintervals->end()) {
--    if (it1->end <= it2->begin) {
-+    if (it1->end_ <= it2->begin_) {
-       ++it1;
--    } else if (it2->end <= it1->begin) {
-+    } else if (it2->end_ <= it1->begin_) {
-       ++it2;
-     } else {
-       Interval interval;
--      interval.begin = max(it1->begin, it2->begin);
--      interval.end = min(it1->end, it2->end);
-+      interval.begin_ = max(it1->begin_, it2->begin_);
-+      interval.end_ = min(it1->end_, it2->end_);
-       ointervals->push_back(interval);
--      oset->count_ += interval.end - interval.begin;
--      if (it1->end < it2->end)
-+      oset->count_ += interval.end_ - interval.begin_;
-+      if (it1->end_ < it2->end_)
-         ++it1;
-       else
-         ++it2;
-@@ -235,21 +235,21 @@ void IntervalSet<T>::Complement(T maxval, IntervalSet<T> *oset) const {
-   oset->count_ = 0;
- 
-   Interval interval;
--  interval.begin = 0;
-+  interval.begin_ = 0;
-   for (typename vector<Interval>::const_iterator it = intervals_.begin();
-        it != intervals_.end();
-        ++it) {
--    interval.end = min(it->begin, maxval);
--    if (interval.begin < interval.end) {
-+    interval.end_ = min(it->begin_, maxval);
-+    if (interval.begin_ < interval.end_) {
-       ointervals->push_back(interval);
--      oset->count_ += interval.end - interval.begin;
-+      oset->count_ += interval.end_ - interval.begin_;
-     }
--    interval.begin = it->end;
-+    interval.begin_ = it->end_;
-   }
--  interval.end = maxval;
--  if (interval.begin < interval.end) {
-+  interval.end_ = maxval;
-+  if (interval.begin_ < interval.end_) {
-     ointervals->push_back(interval);
--    oset->count_ += interval.end - interval.begin;
-+    oset->count_ += interval.end_ - interval.begin_;
-   }
- }
- 
-@@ -263,7 +263,7 @@ void IntervalSet<T>::Difference(const IntervalSet<T> &iset,
-     oset->count_ = 0;
-   } else {
-     IntervalSet<T> cset;
--    iset.Complement(intervals_.back().end, &cset);
-+    iset.Complement(intervals_.back().end_, &cset);
-     Intersect(cset, oset);
-   }
- }
-@@ -277,9 +277,9 @@ bool IntervalSet<T>::Overlaps(const IntervalSet<T> &iset) const {
-   typename vector<Interval>::const_iterator it2 = intervals->begin();
- 
-   while (it1 != intervals_.end() && it2 != intervals->end()) {
--    if (it1->end <= it2->begin) {
-+    if (it1->end_ <= it2->begin_) {
-       ++it1;
--    } else if (it2->end <= it1->begin) {
-+    } else if (it2->end_ <= it1->begin_) {
-       ++it2;
-     } else {
-       return true;
-@@ -300,21 +300,21 @@ bool IntervalSet<T>::StrictlyOverlaps(const IntervalSet<T> &iset) const {
-   bool overlap = false; // point in both intervals_ and intervals
- 
-   while (it1 != intervals_.end() && it2 != intervals->end()) {
--    if (it1->end <= it2->begin) {  // no overlap - it1 first
-+    if (it1->end_ <= it2->begin_) {  // no overlap - it1 first
-       only1 = true;
-       ++it1;
--    } else if (it2->end <= it1->begin) {  // no overlap - it2 first
-+    } else if (it2->end_ <= it1->begin_) {  // no overlap - it2 first
-       only2 = true;
-       ++it2;
--    } else if (it2->begin == it1->begin && it2->end == it1->end) {  // equals
-+    } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) {  // equals
-       overlap = true;
-       ++it1;
-       ++it2;
--    } else if (it2->begin <= it1->begin && it2->end >= it1->end) {  // 1 c 2
-+    } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) {  // 1 c 2
-       only2 = true;
-       overlap = true;
-       ++it1;
--    } else if (it1->begin <= it2->begin && it1->end >= it2->end) {  // 2 c 1
-+    } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) {  // 2 c 1
-       only1 = true;
-       overlap = true;
-       ++it2;
-@@ -346,11 +346,11 @@ bool IntervalSet<T>::Contains(const IntervalSet<T> &iset) const {
-   typename vector<Interval>::const_iterator it2 = intervals->begin();
- 
-   while (it1 != intervals_.end() && it2 != intervals->end()) {
--    if (it1->end <= it2->begin) {  // no overlap - it1 first
-+    if (it1->end_ <= it2->begin_) {  // no overlap - it1 first
-       ++it1;
--    } else if (it2->begin < it1->begin || it2->end > it1->end) {  // no C
-+    } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) {  // no C
-       return false;
--    } else if (it2->end == it1->end) {
-+    } else if (it2->end_ == it1->end_) {
-       ++it1;
-       ++it2;
-     } else {
-@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet<T> &s)  {
-        ++it) {
-     if (it != intervals->begin())
-       strm << ",";
--    strm << "[" << it->begin << "," << it->end << ")";
-+    strm << "[" << it->begin_ << "," << it->end_ << ")";
-   }
-   strm << "}";
-   return strm;
-diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h
-index a7c3360..491ef7d 100644
---- a/src/include/fst/label-reachable.h
-+++ b/src/include/fst/label-reachable.h
-@@ -359,9 +359,9 @@ class LabelReachable {
-                iiter = intervals->begin();
-            iiter != intervals->end(); ++iiter) {
-         begin_low = LowerBound(aiter, end_low, aiter_end,
--                               aiter_input, iiter->begin);
-+                               aiter_input, iiter->begin_);
-         end_low = LowerBound(aiter, begin_low, aiter_end,
--                             aiter_input, iiter->end);
-+                             aiter_input, iiter->end_);
-         if (end_low - begin_low > 0) {
-           if (reach_begin_ < 0)
-             reach_begin_ = begin_low;
-diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h
-index 3fbe3ba..6e9dd3d 100644
---- a/src/include/fst/minimize.h
-+++ b/src/include/fst/minimize.h
-@@ -134,7 +134,14 @@ class CyclicMinimizer {
-   typedef typename A::Weight Weight;
-   typedef ReverseArc<A> RevA;
- 
--  CyclicMinimizer(const ExpandedFst<A>& fst) {
-+  CyclicMinimizer(const ExpandedFst<A>& fst):
-+      // tell the Partition data-member to expect multiple repeated
-+      // calls to SplitOn with the same element if we are non-deterministic.
-+      P_(fst.Properties(kIDeterministic, true) == 0) {
-+    if(fst.Properties(kIDeterministic, true) == 0)
-+      CHECK(Weight::Properties() & kIdempotent); // this minimization
-+    // algorithm for non-deterministic FSTs can only work with idempotent
-+    // semirings.
-     Initialize(fst);
-     Compute(fst);
-   }
-@@ -315,7 +322,13 @@ class AcyclicMinimizer {
-   typedef typename A::StateId ClassId;
-   typedef typename A::Weight Weight;
- 
--  AcyclicMinimizer(const ExpandedFst<A>& fst) {
-+  AcyclicMinimizer(const ExpandedFst<A>& fst):
-+      // tell the Partition data-member to expect multiple repeated
-+      // calls to SplitOn with the same element if we are non-deterministic.
-+      partition_(fst.Properties(kIDeterministic, true) == 0) {
-+    if(fst.Properties(kIDeterministic, true) == 0)
-+      CHECK(Weight::Properties() & kIdempotent); // minimization for
-+    // non-deterministic FSTs can only work with idempotent semirings.
-     Initialize(fst);
-     Refine(fst);
-   }
-@@ -531,13 +544,7 @@ template <class A>
- void Minimize(MutableFst<A>* fst,
-               MutableFst<A>* sfst = 0,
-               float delta = kDelta) {
--  uint64 props = fst->Properties(kAcceptor | kIDeterministic|
--                                 kWeighted | kUnweighted, true);
--  if (!(props & kIDeterministic)) {
--    FSTERROR() << "FST is not deterministic";
--    fst->SetProperties(kError, kError);
--    return;
--  }
-+  uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true);
- 
-   if (!(props & kAcceptor)) {  // weighted transducer
-     VectorFst< GallicArc<A, STRING_LEFT> > gfst;
-diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h
-index dcee67b..40b849a 100644
---- a/src/include/fst/partition.h
-+++ b/src/include/fst/partition.h
-@@ -43,8 +43,8 @@ class Partition {
-   friend class PartitionIterator<T>;
- 
-   struct Element {
--   Element() : value(0), next(0), prev(0) {}
--   Element(T v) : value(v), next(0), prev(0) {}
-+    Element() : value(0), next(0), prev(0) {}
-+    Element(T v) : value(v), next(0), prev(0) {}
- 
-    T        value;
-    Element* next;
-@@ -52,9 +52,11 @@ class Partition {
-   };
- 
-  public:
--  Partition() {}
-+  Partition(bool allow_repeated_split):
-+      allow_repeated_split_(allow_repeated_split) {}
- 
--  Partition(T num_states) {
-+  Partition(bool allow_repeated_split, T num_states):
-+      allow_repeated_split_(allow_repeated_split) {
-     Initialize(num_states);
-   }
- 
-@@ -137,16 +139,16 @@ class Partition {
-     if (class_size_[class_id] == 1) return;
- 
-     // first time class is split
--    if (split_size_[class_id] == 0)
-+    if (split_size_[class_id] == 0) { 
-       visited_classes_.push_back(class_id);
--
-+      class_split_[class_id] = classes_[class_id];
-+    }
-     // increment size of split (set of element at head of chain)
-     split_size_[class_id]++;
--
-+    
-     // update split point
--    if (class_split_[class_id] == 0)
--      class_split_[class_id] = classes_[class_id];
--    if (class_split_[class_id] == elements_[element_id])
-+    if (class_split_[class_id] != 0
-+        && class_split_[class_id] == elements_[element_id])
-       class_split_[class_id] = elements_[element_id]->next;
- 
-     // move to head of chain in same class
-@@ -157,24 +159,31 @@ class Partition {
-   // class indices of the newly created class. Returns the new_class id
-   // or -1 if no new class was created.
-   T SplitRefine(T class_id) {
-+
-+    Element* split_el = class_split_[class_id];
-     // only split if necessary
--    if (class_size_[class_id] == split_size_[class_id]) {
--      class_split_[class_id] = 0;
-+    //if (class_size_[class_id] == split_size_[class_id]) {
-+    if(split_el == NULL) { // we split on everything...
-       split_size_[class_id] = 0;
-       return -1;
-     } else {
--
-       T new_class = AddClass();
-+
-+      if(allow_repeated_split_) { // split_size_ is possibly
-+        // inaccurate, so work it out exactly.
-+        size_t split_count;  Element *e;
-+        for(split_count=0,e=classes_[class_id];
-+            e != split_el; split_count++, e=e->next);
-+        split_size_[class_id] = split_count;
-+      }
-       size_t remainder = class_size_[class_id] - split_size_[class_id];
-       if (remainder < split_size_[class_id]) {  // add smaller
--        Element* split_el   = class_split_[class_id];
-         classes_[new_class] = split_el;
--        class_size_[class_id] = split_size_[class_id];
--        class_size_[new_class] = remainder;
-         split_el->prev->next = 0;
-         split_el->prev = 0;
-+        class_size_[class_id] = split_size_[class_id];
-+        class_size_[new_class] = remainder;
-       } else {
--        Element* split_el   = class_split_[class_id];
-         classes_[new_class] = classes_[class_id];
-         class_size_[class_id] = remainder;
-         class_size_[new_class] = split_size_[class_id];
-@@ -245,10 +254,16 @@ class Partition {
-   vector<T> class_size_;
- 
-   // size of split for each class
-+  // in the nondeterministic case, split_size_ is actually an upper
-+  // bound on the size of split for each class.
-   vector<T> split_size_;
- 
-   // set of visited classes to be used in split refine
-   vector<T> visited_classes_;
-+
-+  // true if input fst was deterministic: we can make
-+  // certain assumptions in this case that speed up the algorithm.
-+  bool allow_repeated_split_;
- };
- 
- 
-diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h
-index 6d0c971..1da922e 100644
---- a/src/include/fst/state-reachable.h
-+++ b/src/include/fst/state-reachable.h
-@@ -112,7 +112,7 @@ class IntervalReachVisitor {
-   void FinishState(StateId s, StateId p, const A *arc) {
-     if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) {
-       vector<Interval> *intervals = (*isets_)[s].Intervals();
--      (*intervals)[0].end = index_;      // Update tree interval end
-+      (*intervals)[0].end_ = index_;      // Update tree interval end
-     }
-     (*isets_)[s].Normalize();
-     if (p != kNoStateId)
+diff --git a/src/include/fst/fst.h b/src/include/fst/fst.h
+index 5ad3b52..d9c0ca6 100644
+--- a/src/include/fst/fst.h
++++ b/src/include/fst/fst.h
+@@ -45,6 +45,12 @@ DECLARE_bool(fst_align);
+ 
+ namespace fst {
+ 
++	typedef ::int64 int64;
++	typedef ::uint64 uint64;
++	typedef ::int32 int32;
++	typedef ::uint32 uint32;
++
++
+ bool OPENFSTDLL IsFstHeader(istream &, const string &); //ChangedPD
+ 
+ class FstHeader;
+diff --git a/src/include/fst/interval-set.h b/src/include/fst/interval-set.h
+index c4362f2..58cad44 100644
+--- a/src/include/fst/interval-set.h
++++ b/src/include/fst/interval-set.h
+@@ -37,38 +37,38 @@ template <typename T>
+ class IntervalSet {
+  public:
+   struct Interval {
+-    T begin;
+-    T end;
++    T begin_;
++    T end_;
+ 
+-    Interval() : begin(-1), end(-1) {}
++    Interval() : begin_(-1), end_(-1) {}
+ 
+-    Interval(T b, T e) : begin(b), end(e) {}
++    Interval(T b, T e) : begin_(b), end_(e) {}
+ 
+     bool operator<(const Interval &i) const {
+-      return begin < i.begin || (begin == i.begin && end > i.end);
++      return begin_ < i.begin_ || (begin_ == i.begin_ && end_ > i.end_);
+     }
+ 
+     bool operator==(const Interval &i) const {
+-      return begin == i.begin && end == i.end;
++      return begin_ == i.begin_ && end_ == i.end_;
+     }
+ 
+     bool operator!=(const Interval &i) const {
+-      return begin != i.begin || end != i.end;
++      return begin_ != i.begin_ || end_ != i.end_;
+     }
+ 
+     istream &Read(istream &strm) {
+       T n;
+       ReadType(strm, &n);
+-      begin = n;
++      begin_ = n;
+       ReadType(strm, &n);
+-      end = n;
++      end_ = n;
+       return strm;
+     }
+ 
+     ostream &Write(ostream &strm) const {
+-      T n = begin;
++      T n = begin_;
+       WriteType(strm, n);
+-      n = end;
++      n = end_;
+       WriteType(strm, n);
+       return strm;
+     }
+@@ -108,7 +108,7 @@ class IntervalSet {
+         lower_bound(intervals_.begin(), intervals_.end(), interval);
+     if (lb == intervals_.begin())
+       return false;
+-    return (--lb)->end > value;
++    return (--lb)->end_ > value;
+   }
+ 
+   // Requires intervals be normalized.
+@@ -123,7 +123,7 @@ class IntervalSet {
+ 
+   bool Singleton() const {
+     return intervals_.size() == 1 &&
+-        intervals_[0].begin + 1 == intervals_[0].end;
++        intervals_[0].begin_ + 1 == intervals_[0].end_;
+   }
+ 
+ 
+@@ -178,17 +178,17 @@ void IntervalSet<T>::Normalize() {
+   T size = 0;
+   for (T i = 0; i < intervals_.size(); ++i) {
+     Interval &inti = intervals_[i];
+-    if (inti.begin == inti.end)
++    if (inti.begin_ == inti.end_)
+       continue;
+     for (T j = i + 1; j < intervals_.size(); ++j) {
+       Interval &intj = intervals_[j];
+-      if (intj.begin > inti.end)
++      if (intj.begin_ > inti.end_)
+         break;
+-      if (intj.end > inti.end)
+-        inti.end = intj.end;
++      if (intj.end_ > inti.end_)
++        inti.end_ = intj.end_;
+       ++i;
+     }
+-    count_ += inti.end - inti.begin;
++    count_ += inti.end_ - inti.begin_;
+     intervals_[size++] = inti;
+   }
+   intervals_.resize(size);
+@@ -208,17 +208,17 @@ void IntervalSet<T>::Intersect(const IntervalSet<T> &iset,
+   oset->count_ = 0;
+ 
+   while (it1 != intervals_.end() && it2 != iintervals->end()) {
+-    if (it1->end <= it2->begin) {
++    if (it1->end_ <= it2->begin_) {
+       ++it1;
+-    } else if (it2->end <= it1->begin) {
++    } else if (it2->end_ <= it1->begin_) {
+       ++it2;
+     } else {
+       Interval interval;
+-      interval.begin = max(it1->begin, it2->begin);
+-      interval.end = min(it1->end, it2->end);
++      interval.begin_ = max(it1->begin_, it2->begin_);
++      interval.end_ = min(it1->end_, it2->end_);
+       ointervals->push_back(interval);
+-      oset->count_ += interval.end - interval.begin;
+-      if (it1->end < it2->end)
++      oset->count_ += interval.end_ - interval.begin_;
++      if (it1->end_ < it2->end_)
+         ++it1;
+       else
+         ++it2;
+@@ -235,21 +235,21 @@ void IntervalSet<T>::Complement(T maxval, IntervalSet<T> *oset) const {
+   oset->count_ = 0;
+ 
+   Interval interval;
+-  interval.begin = 0;
++  interval.begin_ = 0;
+   for (typename vector<Interval>::const_iterator it = intervals_.begin();
+        it != intervals_.end();
+        ++it) {
+-    interval.end = min(it->begin, maxval);
+-    if (interval.begin < interval.end) {
++    interval.end_ = min(it->begin_, maxval);
++    if (interval.begin_ < interval.end_) {
+       ointervals->push_back(interval);
+-      oset->count_ += interval.end - interval.begin;
++      oset->count_ += interval.end_ - interval.begin_;
+     }
+-    interval.begin = it->end;
++    interval.begin_ = it->end_;
+   }
+-  interval.end = maxval;
+-  if (interval.begin < interval.end) {
++  interval.end_ = maxval;
++  if (interval.begin_ < interval.end_) {
+     ointervals->push_back(interval);
+-    oset->count_ += interval.end - interval.begin;
++    oset->count_ += interval.end_ - interval.begin_;
+   }
+ }
+ 
+@@ -263,7 +263,7 @@ void IntervalSet<T>::Difference(const IntervalSet<T> &iset,
+     oset->count_ = 0;
+   } else {
+     IntervalSet<T> cset;
+-    iset.Complement(intervals_.back().end, &cset);
++    iset.Complement(intervals_.back().end_, &cset);
+     Intersect(cset, oset);
+   }
+ }
+@@ -277,9 +277,9 @@ bool IntervalSet<T>::Overlaps(const IntervalSet<T> &iset) const {
+   typename vector<Interval>::const_iterator it2 = intervals->begin();
+ 
+   while (it1 != intervals_.end() && it2 != intervals->end()) {
+-    if (it1->end <= it2->begin) {
++    if (it1->end_ <= it2->begin_) {
+       ++it1;
+-    } else if (it2->end <= it1->begin) {
++    } else if (it2->end_ <= it1->begin_) {
+       ++it2;
+     } else {
+       return true;
+@@ -300,21 +300,21 @@ bool IntervalSet<T>::StrictlyOverlaps(const IntervalSet<T> &iset) const {
+   bool overlap = false; // point in both intervals_ and intervals
+ 
+   while (it1 != intervals_.end() && it2 != intervals->end()) {
+-    if (it1->end <= it2->begin) {  // no overlap - it1 first
++    if (it1->end_ <= it2->begin_) {  // no overlap - it1 first
+       only1 = true;
+       ++it1;
+-    } else if (it2->end <= it1->begin) {  // no overlap - it2 first
++    } else if (it2->end_ <= it1->begin_) {  // no overlap - it2 first
+       only2 = true;
+       ++it2;
+-    } else if (it2->begin == it1->begin && it2->end == it1->end) {  // equals
++    } else if (it2->begin_ == it1->begin_ && it2->end_ == it1->end_) {  // equals
+       overlap = true;
+       ++it1;
+       ++it2;
+-    } else if (it2->begin <= it1->begin && it2->end >= it1->end) {  // 1 c 2
++    } else if (it2->begin_ <= it1->begin_ && it2->end_ >= it1->end_) {  // 1 c 2
+       only2 = true;
+       overlap = true;
+       ++it1;
+-    } else if (it1->begin <= it2->begin && it1->end >= it2->end) {  // 2 c 1
++    } else if (it1->begin_ <= it2->begin_ && it1->end_ >= it2->end_) {  // 2 c 1
+       only1 = true;
+       overlap = true;
+       ++it2;
+@@ -346,11 +346,11 @@ bool IntervalSet<T>::Contains(const IntervalSet<T> &iset) const {
+   typename vector<Interval>::const_iterator it2 = intervals->begin();
+ 
+   while (it1 != intervals_.end() && it2 != intervals->end()) {
+-    if (it1->end <= it2->begin) {  // no overlap - it1 first
++    if (it1->end_ <= it2->begin_) {  // no overlap - it1 first
+       ++it1;
+-    } else if (it2->begin < it1->begin || it2->end > it1->end) {  // no C
++    } else if (it2->begin_ < it1->begin_ || it2->end_ > it1->end_) {  // no C
+       return false;
+-    } else if (it2->end == it1->end) {
++    } else if (it2->end_ == it1->end_) {
+       ++it1;
+       ++it2;
+     } else {
+@@ -370,7 +370,7 @@ ostream &operator<<(ostream &strm, const IntervalSet<T> &s)  {
+        ++it) {
+     if (it != intervals->begin())
+       strm << ",";
+-    strm << "[" << it->begin << "," << it->end << ")";
++    strm << "[" << it->begin_ << "," << it->end_ << ")";
+   }
+   strm << "}";
+   return strm;
+diff --git a/src/include/fst/label-reachable.h b/src/include/fst/label-reachable.h
+index a7c3360..491ef7d 100644
+--- a/src/include/fst/label-reachable.h
++++ b/src/include/fst/label-reachable.h
+@@ -359,9 +359,9 @@ class LabelReachable {
+                iiter = intervals->begin();
+            iiter != intervals->end(); ++iiter) {
+         begin_low = LowerBound(aiter, end_low, aiter_end,
+-                               aiter_input, iiter->begin);
++                               aiter_input, iiter->begin_);
+         end_low = LowerBound(aiter, begin_low, aiter_end,
+-                             aiter_input, iiter->end);
++                             aiter_input, iiter->end_);
+         if (end_low - begin_low > 0) {
+           if (reach_begin_ < 0)
+             reach_begin_ = begin_low;
+diff --git a/src/include/fst/minimize.h b/src/include/fst/minimize.h
+index 3fbe3ba..6e9dd3d 100644
+--- a/src/include/fst/minimize.h
++++ b/src/include/fst/minimize.h
+@@ -134,7 +134,14 @@ class CyclicMinimizer {
+   typedef typename A::Weight Weight;
+   typedef ReverseArc<A> RevA;
+ 
+-  CyclicMinimizer(const ExpandedFst<A>& fst) {
++  CyclicMinimizer(const ExpandedFst<A>& fst):
++      // tell the Partition data-member to expect multiple repeated
++      // calls to SplitOn with the same element if we are non-deterministic.
++      P_(fst.Properties(kIDeterministic, true) == 0) {
++    if(fst.Properties(kIDeterministic, true) == 0)
++      CHECK(Weight::Properties() & kIdempotent); // this minimization
++    // algorithm for non-deterministic FSTs can only work with idempotent
++    // semirings.
+     Initialize(fst);
+     Compute(fst);
+   }
+@@ -315,7 +322,13 @@ class AcyclicMinimizer {
+   typedef typename A::StateId ClassId;
+   typedef typename A::Weight Weight;
+ 
+-  AcyclicMinimizer(const ExpandedFst<A>& fst) {
++  AcyclicMinimizer(const ExpandedFst<A>& fst):
++      // tell the Partition data-member to expect multiple repeated
++      // calls to SplitOn with the same element if we are non-deterministic.
++      partition_(fst.Properties(kIDeterministic, true) == 0) {
++    if(fst.Properties(kIDeterministic, true) == 0)
++      CHECK(Weight::Properties() & kIdempotent); // minimization for
++    // non-deterministic FSTs can only work with idempotent semirings.
+     Initialize(fst);
+     Refine(fst);
+   }
+@@ -531,13 +544,7 @@ template <class A>
+ void Minimize(MutableFst<A>* fst,
+               MutableFst<A>* sfst = 0,
+               float delta = kDelta) {
+-  uint64 props = fst->Properties(kAcceptor | kIDeterministic|
+-                                 kWeighted | kUnweighted, true);
+-  if (!(props & kIDeterministic)) {
+-    FSTERROR() << "FST is not deterministic";
+-    fst->SetProperties(kError, kError);
+-    return;
+-  }
++  uint64 props = fst->Properties(kAcceptor | kWeighted | kUnweighted, true);
+ 
+   if (!(props & kAcceptor)) {  // weighted transducer
+     VectorFst< GallicArc<A, STRING_LEFT> > gfst;
+diff --git a/src/include/fst/partition.h b/src/include/fst/partition.h
+index dcee67b..40b849a 100644
+--- a/src/include/fst/partition.h
++++ b/src/include/fst/partition.h
+@@ -43,8 +43,8 @@ class Partition {
+   friend class PartitionIterator<T>;
+ 
+   struct Element {
+-   Element() : value(0), next(0), prev(0) {}
+-   Element(T v) : value(v), next(0), prev(0) {}
++    Element() : value(0), next(0), prev(0) {}
++    Element(T v) : value(v), next(0), prev(0) {}
+ 
+    T        value;
+    Element* next;
+@@ -52,9 +52,11 @@ class Partition {
+   };
+ 
+  public:
+-  Partition() {}
++  Partition(bool allow_repeated_split):
++      allow_repeated_split_(allow_repeated_split) {}
+ 
+-  Partition(T num_states) {
++  Partition(bool allow_repeated_split, T num_states):
++      allow_repeated_split_(allow_repeated_split) {
+     Initialize(num_states);
+   }
+ 
+@@ -137,16 +139,16 @@ class Partition {
+     if (class_size_[class_id] == 1) return;
+ 
+     // first time class is split
+-    if (split_size_[class_id] == 0)
++    if (split_size_[class_id] == 0) { 
+       visited_classes_.push_back(class_id);
+-
++      class_split_[class_id] = classes_[class_id];
++    }
+     // increment size of split (set of element at head of chain)
+     split_size_[class_id]++;
+-
++    
+     // update split point
+-    if (class_split_[class_id] == 0)
+-      class_split_[class_id] = classes_[class_id];
+-    if (class_split_[class_id] == elements_[element_id])
++    if (class_split_[class_id] != 0
++        && class_split_[class_id] == elements_[element_id])
+       class_split_[class_id] = elements_[element_id]->next;
+ 
+     // move to head of chain in same class
+@@ -157,24 +159,31 @@ class Partition {
+   // class indices of the newly created class. Returns the new_class id
+   // or -1 if no new class was created.
+   T SplitRefine(T class_id) {
++
++    Element* split_el = class_split_[class_id];
+     // only split if necessary
+-    if (class_size_[class_id] == split_size_[class_id]) {
+-      class_split_[class_id] = 0;
++    //if (class_size_[class_id] == split_size_[class_id]) {
++    if(split_el == NULL) { // we split on everything...
+       split_size_[class_id] = 0;
+       return -1;
+     } else {
+-
+       T new_class = AddClass();
++
++      if(allow_repeated_split_) { // split_size_ is possibly
++        // inaccurate, so work it out exactly.
++        size_t split_count;  Element *e;
++        for(split_count=0,e=classes_[class_id];
++            e != split_el; split_count++, e=e->next);
++        split_size_[class_id] = split_count;
++      }
+       size_t remainder = class_size_[class_id] - split_size_[class_id];
+       if (remainder < split_size_[class_id]) {  // add smaller
+-        Element* split_el   = class_split_[class_id];
+         classes_[new_class] = split_el;
+-        class_size_[class_id] = split_size_[class_id];
+-        class_size_[new_class] = remainder;
+         split_el->prev->next = 0;
+         split_el->prev = 0;
++        class_size_[class_id] = split_size_[class_id];
++        class_size_[new_class] = remainder;
+       } else {
+-        Element* split_el   = class_split_[class_id];
+         classes_[new_class] = classes_[class_id];
+         class_size_[class_id] = remainder;
+         class_size_[new_class] = split_size_[class_id];
+@@ -245,10 +254,16 @@ class Partition {
+   vector<T> class_size_;
+ 
+   // size of split for each class
++  // in the nondeterministic case, split_size_ is actually an upper
++  // bound on the size of split for each class.
+   vector<T> split_size_;
+ 
+   // set of visited classes to be used in split refine
+   vector<T> visited_classes_;
++
++  // true if input fst was non-deterministic: SplitOn may then be called
++  // repeatedly with the same element, so split_size_ must be recomputed.
++  bool allow_repeated_split_;
+ };
+ 
+ 
+diff --git a/src/include/fst/state-reachable.h b/src/include/fst/state-reachable.h
+index 6d0c971..1da922e 100644
+--- a/src/include/fst/state-reachable.h
++++ b/src/include/fst/state-reachable.h
+@@ -112,7 +112,7 @@ class IntervalReachVisitor {
+   void FinishState(StateId s, StateId p, const A *arc) {
+     if (index_ >= 0 && fst_.Final(s) != Weight::Zero()) {
+       vector<Interval> *intervals = (*isets_)[s].Intervals();
+-      (*intervals)[0].end = index_;      // Update tree interval end
++      (*intervals)[0].end_ = index_;      // Update tree interval end
+     }
+     (*isets_)[s].Normalize();
+     if (p != kNoStateId)
diff --git a/windows/INSTALL b/windows/INSTALL
deleted file mode 100644
index d743129498b..00000000000
--- a/windows/INSTALL
+++ /dev/null
@@ -1,146 +0,0 @@
-
-# Installation instructions for native Windows with Visual
-# studio (for cygwin installation, see the instructions 
-# in ../INSTALL).
-
-#NOTE: These instructions are valid June 2015, MKL and OpenBLAS are supported
-#NOTE: ATLAS is not supported and I personally have no intention to work on supporting
-#      it, as it requires whole cygwin environment
-#NOTE: We now (20150613) support CUDA on Windows as well. The build was 
-#      tested on CUDA 7.0. It is possible that the compilation fails
-#      for significantly older CUDA SDK (less than, say, 5.0)
-#      Please not that CUDA support for windows is not really that usefull,
-#      because, the speed benefit during decoding is not large. And for training
-#      one would have to re-implement the while training pipeline (as the 
-#      bash script wouldn't most probably work) 
-#NOTE: While the 32bit project files will still be generated, we don't really
-#      care if they work or not. They will be removed in the near future.
-#NOTE: The build process were validated using MSVS2013 and partially (MKL only) using MSVS2015-rc
-#NOTE: We support only openfst-1.3.x for now.
-#NOTE: I suggest to have git installed -- not only because we will 
-#      use it to download the source codes (you could download archives
-#      instead of it), but also because the windows version comes
-#      with a bunch of useful utilities. 
-#NOTE: The examples will assume you have installed the git for windows
-#      and during the installation you chose the GIT Shell to install as well.
-#      Moreover, all the commands are issued from the same session
-
-1) Checkout Kaldi trunk, either using the svn from the url
-   https://svn.code.sf.net/p/kaldi/code/trunk
-   or using git from 
-   https://github.com/kaldi-asr/kaldi.git
-   Example:
-     $ git clone https://github.com/kaldi-asr/kaldi.git kaldi
-
-2) enter the (kaldi)/tools directory in the freshly 
-   checked-out kaldi repo. All following actions should
-   be taken in the tools dir
-   Example:
-     $ cd (kaldi)/tools
-	 (kaldi)/tools$ pwd
-   
-   
-2a) Use git to clone the OpenFST(win) from
-   https://github.com/jtrmal/openfstwin-1.3.4.git
-   Example:
-     (kaldi)/tools$ git clone https://github.com/jtrmal/openfstwin-1.3.4.git openfst
-   
-   
-2b) Download pthread-win32 (or wget or curl)
-   https://sourceforge.net/projects/pthreads4w/
-     (kaldi)/tools$ wget http://downloads.sourceforge.net/project/pthreads4w/pthreads-w32-2-9-1-release.zip
-	 (kaldi)/tools$ mkdir pthreads; cd pthreads
-	 (kaldi)/tools/pthreads$ unzip ../pthreads-w32-2-9-1-release.zip
-
-2c) Use patch (or you can use git patch) to patch the OpenFST(win)
-   patch location tools/extras/openfstwin-1.3.4.patch,
-   Example:
-     (kaldi)/tools$ cd openfst
-	 (kaldi)/tools/openfst$ patch -p1 <../extras/openfstwin-1.3.4.patch 
-
-2d-1) Download the OpenBLAS binary packages
-      https://sourceforge.net/projects/openblas
-	  (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int32.zip
-	  (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip
-	  (kaldi)/tools$ unzip OpenBLAS-v0.2.14-Win64-int32.zip
-	  (kaldi)/tools$ unzip mingw64_dll.zip
-	  
-	  NOTE: Be carefull to download "Win64-int32" and not "Win64-int64"!
-	  
-2d-2) Install MKL
-2e) If you want enabled CUDA support, download and install NVidia CUDA SDK.
-    Be careful and strive for as standard install as possible. The installer
-	set certain environment variables on which the MSVC Build rules rely.
-	If you call "set" in the command line, you should see:
-    
-	(kaldi)/tools $ set | grep CUDA
-     CUDA_PATH='C:\Users\Yenda\Downloads\cuda'
-     CUDA_PATH_V7_0='C:\Users\Yenda\Downloads\cuda'
-     NVCUDASAMPLES7_0_ROOT='C:\Users\Yenda\Downloads\cuda'
-     NVCUDASAMPLES_ROOT='C:\Users\Yenda\Downloads\cuda'
-	
-    The first one (CUDA_PATH) is particulary important.
- 	
-3)  Open the OpenFST solution in VS -- 
-   for VS 2013, the correct solution is in VS2012 directory
-   for VS 2014, the correct solution is in VS2014 directory
-   !!!switch the configuration to debug|x64 and build the solution
-   !!!The same for configuration release|x64
-   If either of the two won't build, you should stop here and start figuring what's different!
-
-4)  Enter the (kaldi)/windows directory
-   Example:
-	 (kaldi)/tools/openfst$ cd ../../windows
-	 (kaldi)/windows $ pwd
-	 
-4a) modify the file variables.props to reflect 
-    the correct paths, using your favorite text editor.
-	Don't worry, it's a text file, even though you have to be 
-	careful to keep the structure itself intact
-	(kaldi)/windows $ vim variables.props
-	
-	If you plan to use MKL, you can ignore the OPENBLASDIR path
-	If you plan to use OpenBLAS, you can ignore the MKLDIR path
-	No matter what you plan to use, set both the OPENFST* and PTHREADW
-	variables correctly
-	
-4b-1) For OpenBLAS support, copy the file "kaldiwin_openblas.props" to "kaldiwin.props"
-4b-2) For MKL support, you don't have to do anything, it should work out of the box. 
-      When you need to switch from OpenBLAS to MKL, copy the "kaldiwin_mkl.props" 
-	  to "kaldiwin.props"
-
-
-4c) call the script that generates the MSVC solution
-	i.e.
-	generate_solution.pl --vsver <default|vs2013|vs2015>
-	i.e. for example
-	generate_solution.pl --vsver vs2013
-	
-	For CUDA support, add switch --enable-cuda to the command line,
-	i.e. for example
-	generate_solution.pl --vsver vs2013 --enable-cuda
-	
-5)  Open the generated solution in the visual studio and switch to Debug|x64 (or Release|x64) and build
-   Expect 10 projects to fail, majority of them will fail because of missing include "portaudio.h"
-
-------  	
-NOTE: I'm leaving the information about ATLAS here, for reference (also do not forget to consult the README.ATLAS)
-(B) either
-   (i) compile ATLAS under cygwin [see INSTALL.atlas] and copy
-  kaldiwin_atlas.props  to kaldiwin.props
-
-(D)
-If you had installed ATLAS, you next have to do this:
-[assuming you are one level above this directory]
-cd kaldiwin_vs10_auto/
-
-# type the following (these commands were done from cygwin): note that these
-# commands are a bit wasteful of disk; you could alternatively ensure that
-# [root]/tools/ATLAS/cygwin_build/install/lib/ is always on your path when you
-# run the binaries.
-
-mkdir -p Debug Release
-cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Debug
-cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Release
-
-Then build the project with Visual Studio.
diff --git a/windows/INSTALL.md b/windows/INSTALL.md
new file mode 100644
index 00000000000..6a57d3d2ee2
--- /dev/null
+++ b/windows/INSTALL.md
@@ -0,0 +1,176 @@
+
+# Installation instructions for native Windows with Visual Studio
+
+For cygwin installation, see the instructions in `../INSTALL`.
+
+## Notes
+
+* These instructions are valid as of June 2015; MKL and OpenBLAS are supported
+* ATLAS is not supported and I personally have no intention to work on supporting
+  it, as it requires the whole cygwin environment
+* We now (20150613) support CUDA on Windows as well. The build was
+  tested on CUDA 7.0. It is possible that the compilation fails
+  for significantly older CUDA SDK (less than, say, 5.0)
+  Please note that CUDA support for Windows is not really that useful,
+  because the speed benefit during decoding is not large. And for training
+  one would have to re-implement the whole training pipeline (as the
+  bash scripts most probably wouldn't work)
+* While the 32bit project files will still be generated, we don't really
+  care if they work or not. They will be removed in the near future.
+* The build process was validated using MSVS2013 and partially (MKL only) using MSVS2015-rc
+* We support only openfst-1.3.x for now.
+* I suggest having git installed -- not only because we will
+  use it to download the source code (you could download archives
+  instead), but also because the Windows version comes
+  with a bunch of useful utilities.
+* The examples will assume you have installed Git for Windows
+  and that during the installation you chose to install the Git Shell as well.
+  Moreover, all the commands are issued from the same session.
+
+## Steps
+
+1. Checkout Kaldi trunk, either using the svn from the url https://svn.code.sf.net/p/kaldi/code/trunk
+   or using git from https://github.com/kaldi-asr/kaldi.git
+
+   Example:
+   
+        $ git clone https://github.com/kaldi-asr/kaldi.git kaldi
+
+2. Enter the `(kaldi)/tools` directory in the freshly
+   checked-out kaldi repo. All following actions should
+   be taken in the tools dir.
+
+   Example:
+   
+        $ cd (kaldi)/tools
+        (kaldi)/tools$ pwd
+
+3. Use git to clone the OpenFST(win) from
+       
+        https://github.com/jtrmal/openfstwin-1.3.4.git
+
+   Example:
+   
+        (kaldi)/tools$ git clone https://github.com/jtrmal/openfstwin-1.3.4.git openfst
+
+4. Download pthread-win32 (e.g. using wget or curl)
+
+   https://sourceforge.net/projects/pthreads4w/
+
+        (kaldi)/tools$ wget http://downloads.sourceforge.net/project/pthreads4w/pthreads-w32-2-9-1-release.zip
+        (kaldi)/tools$ mkdir pthreads; cd pthreads
+        (kaldi)/tools/pthreads$ unzip ../pthreads-w32-2-9-1-release.zip
+
+5. Use patch (or you can use git patch) to patch the OpenFST(win).
+
+   The patch location is `tools/extras/openfstwin-1.3.4.patch`
+
+   Example:
+   
+        (kaldi)/tools$ cd openfst
+        (kaldi)/tools/openfst$ patch -p1 <../extras/openfstwin-1.3.4.patch
+
+   If you get this error: `Assertion failed: hunk, file ../patch-2.5.9-src/patch.c, line 354`
+   it is because the patch file (`openfstwin-1.3.4.patch`) should have Windows line endings (CRLF) rather than Unix ones (LF).
+   
+There are two options to use for BLAS (linear algebra): MKL and OpenBLAS. MKL is made by Intel and is optimised
+for their processors. Unfortunately it isn't free. OpenBLAS is a free alternative with similar performance.
+
+6. If using MKL, install it.
+
+7. If using OpenBLAS, download the binary packages.
+
+   https://sourceforge.net/projects/openblas
+
+        (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int32.zip
+        (kaldi)/tools$ wget http://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip
+        (kaldi)/tools$ unzip OpenBLAS-v0.2.14-Win64-int32.zip
+        (kaldi)/tools$ unzip mingw64_dll.zip
+
+   **Be careful to download "Win64-int32" and not "Win64-int64"!**
+
+8. If you want CUDA support enabled, download and install the NVIDIA CUDA SDK.
+   Be careful and strive for as standard an install as possible. The installer
+   sets certain environment variables on which the MSVC Build rules rely.
+   If you call "set" in the command line, you should see:
+
+        (kaldi)/tools $ set | grep CUDA
+        CUDA_PATH='C:\Users\Yenda\Downloads\cuda'
+        CUDA_PATH_V7_0='C:\Users\Yenda\Downloads\cuda'
+        NVCUDASAMPLES7_0_ROOT='C:\Users\Yenda\Downloads\cuda'
+        NVCUDASAMPLES_ROOT='C:\Users\Yenda\Downloads\cuda'
+
+   The first one (`CUDA_PATH`) is particularly important.
+
+9. Open the OpenFST solution in VS
+
+   * for VS 2013, the correct solution is in VS2012 directory
+   * for VS 2015, the correct solution is in VS2014 directory
+
+   **Switch the configuration to `debug|x64` and build the solution.**
+
+   **Do the same for configuration `release|x64`.**
+
+   If either of the two won't build, you should stop here and start figuring out what's different!
+
+10. Enter the `(kaldi)/windows` directory
+
+    Example:
+    
+         (kaldi)/tools/openfst$ cd ../../windows
+         (kaldi)/windows $ pwd
+
+11. Modify the file `variables.props` to reflect
+    the correct paths, using your favorite text editor.
+    Don't worry, it's a text file, even though you have to be
+    careful to keep the structure itself intact
+
+         (kaldi)/windows $ vim variables.props
+
+    If you plan to use MKL, you can ignore the `OPENBLASDIR` path.
+    If you plan to use OpenBLAS, you can ignore the `MKLDIR` path.
+    No matter what you plan to use, set both the `OPENFST*` and `PTHREADW`
+    variables correctly
+
+12. For OpenBLAS support, copy the file `kaldiwin_openblas.props` to `kaldiwin.props`
+13. For MKL support, you don't have to do anything, it should work out of the box.
+    When you need to switch from OpenBLAS to MKL, copy the `kaldiwin_mkl.props`
+    to `kaldiwin.props`
+
+14. Call the script that generates the MSVC solution
+
+         generate_solution.pl --vsver <default|vs2013|vs2015> [--enable-cuda] [--enable-openblas] [--enable-mkl]
+
+    `--enable-mkl` is the default so you shouldn't need to use it. If `--enable-openblas` is passed it disables MKL support.
+    CUDA is disabled by default. The default Visual Studio version is 11.0 (Visual Studio 2012).
+
+    For example, for a build supporting CUDA using OpenBLAS and VS 2015 you would run:
+
+         (kaldi)/windows $ generate_solution.pl --vsver vs2015 --enable-cuda --enable-openblas
+
+15. Open the generated solution in Visual Studio, switch to Debug|x64 (or Release|x64) and build.
+    Expect 10 projects to fail; the majority of them will fail because of the missing include `portaudio.h`
+
+------
+NOTE: I'm leaving the information about ATLAS here, for reference (also do not forget to consult the README.ATLAS)
+
+(B) either
+   (i) compile ATLAS under cygwin [see INSTALL.atlas] and copy
+  kaldiwin_atlas.props  to kaldiwin.props
+
+(D)
+If you had installed ATLAS, you next have to do this:
+[assuming you are one level above this directory]
+
+    cd kaldiwin_vs10_auto/
+
+Type the following (these commands were done from cygwin): note that these
+commands are a bit wasteful of disk; you could alternatively ensure that
+[root]/tools/ATLAS/cygwin_build/install/lib/ is always on your path when you
+run the binaries.
+
+    mkdir -p Debug Release
+    cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Debug
+    cp ../tools/ATLAS/cygwin_build/install/lib/lib_atlas.dll Release
+
+Then build the project with Visual Studio.