diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh index d9437af7e0c..53221a2bd53 100755 --- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index c0af57b4a5d..0776bc05923 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -48,6 +48,13 @@ %WER 24.2 | 13098 94477 | 79.3 12.2 8.6 3.5 24.2 57.1 | -0.178 | exp/ihm/nnet3/tdnn_sp/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 25.4 | 12643 89970 | 77.6 13.7 8.7 3.0 25.4 56.3 | -0.067 | exp/ihm/nnet3/tdnn_sp/decode_eval/ascore_12/eval_hires.ctm.filt.sys +# local/nnet3/run_blstm.sh --mic ihm +# nnet3 xent BLSTM with data cleaning +# for d in exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 22.3 | 13098 94494 | 80.9 11.7 7.4 3.2 22.3 55.7 | -0.618 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.5 | 12643 89962 | 80.2 12.7 7.1 2.7 22.5 53.4 | -0.476 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys + ############################################ # local/chain/run_tdnn.sh --mic ihm --stage 12 & @@ -62,3 +69,15 @@ for d in exp/ihm/chain/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 22.4 | 13098 94476 | 80.4 10.4 9.2 2.8 22.4 54.6 | 0.069 | exp/ihm/chain/tdnn_sp_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 22.5 | 12643 89974 | 80.0 12.1 7.9 2.6 22.5 52.8 | 0.157 | exp/ihm/chain/tdnn_sp_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys + +# local/chain/multi_condition/run_tdnn.sh --mic ihm & +# cleanup + chain TDNN model + IHM reverberated data +# for d in exp/ihm/chain_cleaned_rvb/tdnn_sp_bi/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 21.5 | 13098 94486 | 81.8 11.0 7.2 3.3 21.5 54.6 | 0.090 | exp/ihm/chain_cleaned_rvb/tdnn_sp_rvb_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 21.9 | 12643 89985 | 80.8 12.3 6.9 2.7 21.9 52.5 | 0.183 | exp/ihm/chain_cleaned_rvb/tdnn_sp_rvb_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys + + +# local/chain/tuning/run_tdnn_lstm_1i.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model +%WER 20.8 | 13098 94489 | 82.0 10.0 8.0 2.8 20.8 53.2 | -0.096 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_dev/ascore_11/dev_hires.ctm.filt.sys +%WER 20.7 | 12643 89980 | 81.7 11.5 6.8 2.5 20.7 51.8 | 0.015 | exp/ihm/chain_cleaned/tdnn_lstm1i_sp_bi_ld5/decode_eval/ascore_11/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_mdm b/egs/ami/s5b/RESULTS_mdm index da56d650e73..80eb152fc5d 100644 --- a/egs/ami/s5b/RESULTS_mdm +++ b/egs/ami/s5b/RESULTS_mdm @@ -65,7 +65,6 
@@ # cleanup + chain TDNN model, alignments from IHM data (IHM alignments help). # local/chain/run_tdnn.sh --mic mdm8 --use-ihm-ali true --stage 12 & -# *** best system *** # for d in exp/mdm8/chain_cleaned/tdnn_sp_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 37.4 | 15286 94509 | 66.6 18.0 15.5 3.9 37.4 62.8 | 0.624 | exp/mdm8/chain_cleaned/tdnn_sp_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 40.6 | 13381 89982 | 62.7 18.9 18.3 3.3 40.6 67.6 | 0.594 | exp/mdm8/chain_cleaned/tdnn_sp_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys @@ -78,4 +77,15 @@ %WER 37.9 | 15635 94514 | 66.5 19.1 14.4 4.4 37.9 61.2 | 0.646 | exp/mdm8/chain/tdnn_sp_bi_ihmali/decode_dev/ascore_8/dev_hires_o4.ctm.filt.sys %WER 41.5 | 13884 89975 | 62.3 20.3 17.4 3.8 41.5 66.0 | 0.621 | exp/mdm8/chain/tdnn_sp_bi_ihmali/decode_eval/ascore_8/eval_hires_o4.ctm.filt.sys +# local/chain/multi_condition/run_tdnn.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN model, MDM original + IHM reverberated data, alignments from IHM data +# for d in exp/mdm8/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 35.8 | 14512 94498 | 68.2 17.2 14.6 4.0 35.8 64.9 | 0.632 | exp/mdm8/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 39.1 | 13651 89967 | 64.3 18.4 17.3 3.3 39.1 65.2 | 0.607 | exp/mdm8/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1i.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN+LSTM model, MDM audio and alignments from IHM data +# *** best system *** +%WER 34.6 | 15116 94508 | 69.6 17.6 12.9 4.1 34.6 62.3 | 0.687 | exp/mdm8/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 37.1 | 14343 90002 | 66.3 18.8 14.9 3.4 37.1 62.3 | 0.659 | exp/mdm8/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index cec525d797d..f0177a45078 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -46,6 +46,12 @@ %WER 41.6 | 14493 94516 | 63.3 23.5 13.2 4.9 41.6 66.8 | 0.639 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_dev/ascore_13/dev_hires_o4.ctm.filt.sys %WER 46.0 | 13597 89967 | 57.5 24.9 17.6 3.6 46.0 68.1 | 0.601 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_eval/ascore_14/eval_hires_o4.ctm.filt.sys +# xent BLSTM system; cleaned data and IHM alignments. +# local/nnet3/run_blstm.sh --mic sdm1 --use-ihm-ali true +# for d in exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 37.8 | 14633 94518 | 67.1 22.3 10.7 4.9 37.8 64.2 | 0.745 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 41.4 | 13809 89628 | 62.7 24.1 13.2 4.1 41.4 65.2 | 0.723 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys # ========================= @@ -62,7 +68,6 @@ # cleanup + chain TDNN model, alignments from IHM data (IHM alignments help). # local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --stage 12 & # cleanup + chain TDNN model, cleaned data and alignments from ihm data. 
-# *** best system *** # for d in exp/sdm1/chain_cleaned/tdnn_sp_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done %WER 40.7 | 14321 94501 | 63.0 19.6 17.4 3.7 40.7 67.7 | 0.592 | exp/sdm1/chain_cleaned/tdnn_sp_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 44.8 | 14293 89976 | 58.6 21.3 20.1 3.3 44.8 64.2 | 0.559 | exp/sdm1/chain_cleaned/tdnn_sp_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys @@ -75,4 +80,16 @@ %WER 40.7 | 14549 94520 | 63.6 21.4 15.0 4.3 40.7 66.2 | 0.617 | exp/sdm1/chain/tdnn_sp_bi_ihmali/decode_dev/ascore_8/dev_hires_o4.ctm.filt.sys %WER 45.1 | 13296 89971 | 59.1 23.4 17.6 4.2 45.1 69.5 | 0.591 | exp/sdm1/chain/tdnn_sp_bi_ihmali/decode_eval/ascore_8/eval_hires_o4.ctm.filt.sys +# local/chain/multi_condition/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# cleanup + chain TDNN model, SDM original + IHM reverberated data, alignments from ihm data. +# *** best system *** +# for d in exp/sdm1/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +%WER 38.6 | 14760 94502 | 65.3 19.3 15.4 3.9 38.6 64.9 | 0.599 | exp/sdm1/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +%WER 42.7 | 14070 89982 | 60.9 21.0 18.0 3.6 42.7 64.5 | 0.571 | exp/sdm1/chain_cleaned_rvb/tdnn_sp_rvb_bi_ihmali/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +# local/chain/tuning/run_tdnn_lstm_1i.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN model, SDM audio + alignments from ihm data. +# *** best system *** +%WER 37.6 | 15122 94495 | 66.1 18.7 15.1 3.7 37.6 63.2 | 0.646 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 40.9 | 13807 89961 | 62.4 20.0 17.6 3.3 40.9 65.7 | 0.612 | exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/local/chain/compare_wer_general.sh b/egs/ami/s5b/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..225890daf5c --- /dev/null +++ b/egs/ami/s5b/local/chain/compare_wer_general.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +mic=$1; +shift; + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +#for d in exp/sdm1/chain_cleaned/tdnn*/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done|grep eval_hires + + +echo -n "WER on dev " +for x in $*; do + wer=$(grep Sum exp/$mic/chain_cleaned/${x}/decode_dev*/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval " +for x in $*; do + wer=$(grep Sum exp/$mic/chain_cleaned/${x}/decode_eval*/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/$mic/chain_cleaned/${x}/log/compute_prob_valid.final.log 
| grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh new file mode 100755 index 00000000000..28c9849d885 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh @@ -0,0 +1,283 @@ +#!/bin/bash + +# This is a chain-training script with TDNN neural networks. +# This script is based on local/chain/run_tdnn.sh, but adding +# the reverberated IHM data into the train set. +# This script obtains better results on both IHM and SDM tasks. + +# Please see RESULTS_* for examples of command lines invoking this script. + +# local/chain/multi_condition/run_tdnn.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/run_tdnn.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=1 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=true +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
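+  # (Only the original data is aligned here, into $original_lat_dir; lattices for the rev*_ copies of the IHM data are copied from the IHM lattice dir and merged into $lat_dir further below.)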
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats + + mkdir -p $lat_dir/temp/ + mkdir -p $lat_dir/temp2/ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.*.gz |" ark,scp:$lat_dir/temp2/lats.ark,$lat_dir/temp2/lats.scp + + # copy the lattices for the reverberated data + rm -f $lat_dir/temp/combined_lats.scp + touch $lat_dir/temp/combined_lats.scp + cat $lat_dir/temp/lats.scp >> $lat_dir/temp/combined_lats.scp + for i in `seq 1 $num_data_reps`; do + cat $lat_dir/temp2/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp + done + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; + echo "1" > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/$mic/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-rvb$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh deleted file mode 100755 index 8df62af8bad..00000000000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ /dev/null @@ -1,242 +0,0 @@ -#!/bin/bash - -# This is a chain-training script with TDNN neural networks. -# Please see RESULTS_* for examples of command lines invoking this script. - - -# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali true --mic sdm1 # rerunning with biphone -# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali false --mic sdm1 - -# local/chain/run_tdnn.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & - -# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --stage 12 & -# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & - -# local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned& - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -mic=ihm -nj=30 -min_seg_len=1.55 -use_ihm_ali=false -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix= #affix for TDNN directory, e.g. 
"a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 13 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 14 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 15 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/$mic/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 450 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 16 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage - fi - - touch $dir/egs/.nodelete # keep egs around when that run dies. - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - -graph_dir=$dir/graph_${LM} -if [ $stage -le 17 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir -fi - -if [ $stage -le 18 ]; then - rm $dir/.error 2>/dev/null || true - for decode_set in dev eval; do - ( - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..8df62af8bad --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,242 @@ +#!/bin/bash + +# This is a chain-training script with TDNN neural networks. +# Please see RESULTS_* for examples of command lines invoking this script. + + +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali true --mic sdm1 # rerunning with biphone +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali false --mic sdm1 + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --stage 12 & +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned& + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
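+  # (The tree uses a left-biphone context, i.e. --context-width=2 --central-position=1, matching the --left-biphone graph built in stage 17.)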
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/$mic/${train_set}_sp_hires_comb \ + --ivector-dir $train_ivector_dir \ + --tree-dir $tree_dir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..a262f8e1860 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# This is a chain-training script with TDNN neural networks. +# Please see RESULTS_* for examples of command lines invoking this script. 
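+# (1b generates the network config with the xconfig parser in stage 15 below, instead of steps/nnet3/tdnn/make_configs.py as in tuning/run_tdnn_1a.sh; the training options are otherwise the same.)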
+ + +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali true --mic sdm1 # rerunning with biphone +# local/nnet3/run_tdnn.sh --stage 8 --use-ihm-ali false --mic sdm1 + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --stage 12 & +# local/chain/run_tdnn.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" --stage 12 & + +# local/chain/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned& + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..64cde69e7dd --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# same as 1b but with shorter minibatches + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1c #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
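+  # The 'shorter minibatches' change: --trainer.num-chunk-per-minibatch is 64 here, vs. 128 in run_tdnn_1b.sh.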
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..ba136e67521 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# TDNN+LSTM architecture similar to swbd/tdnn_lstm_1b +# results on sdm1 with ihm ali +#System tdnn1b tdnn_lstm1a +#WER on dev 39.9 38.9 +#WER on eval 43.9 42.2 +#Final train prob -0.186387 -0.142585 +#Final valid prob -0.259997 -0.251197 +#Final train prob (xent) -2.4593 -1.73176 +#Final valid prob (xent) -2.70347 -2.26965 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. 
./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
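+  # For example, with xent_regularize=0.1 as set above, learning_rate_factor = 0.5/0.1 = 5.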
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
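+  # The --self-loop-scale 1.0 below matches the acoustic scale of 1.0 used when decoding the chain model in stage 18.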
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..ed615a98e30 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# same as 1a but the neural network has two more TDNN layers (0,3 0,3) +# above the lstm +# results on sdm1 with ihm ali + +#System 1a 1b +#WER on dev 38.9 39.6 +#WER on eval 42.2 42.9 +#Final train prob -0.142585 -0.152283 +#Final valid prob -0.251197 -0.253287 +#Final train prob (xent) -1.73176 -1.77542 +#Final valid prob (xent) -2.26965 -2.28851 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1b #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn8 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(0,3) dim=512 + + ## adding the layers for chain branch + output-layer name=output input=tdnn9 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=tdnn9 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh new file mode 100755 index 00000000000..ce719d6f2cb --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -0,0 +1,291 @@ +#!/bin/bash + +# same as 1a, but with more TDNN layers between each LSTM +# results on sdm1 with ihm ali +#System 1a 1c +#WER on dev 38.9 39.0 +#WER on eval 42.2 41.9 +#Final train prob -0.142585 -0.142951 +#Final valid prob -0.251197 -0.249901 +#Final train prob (xent) -1.73176 -1.71779 +#Final valid prob (xent) -2.26965 -2.22776 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. 
_cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1c #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for 
xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
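# Note on stage 15 above: with the default xent_regularize=0.1, the
# xent-branch learning-rate factor works out to 0.5 / 0.1 = 5.0.  The
#   echo "print 0.5/$xent_regularize" | python
# construction relies on 'python' being Python 2 (print as a statement); on
# a machine where 'python' is Python 3, something along these lines should
# be equivalent (illustrative sketch only, not part of the patch):
#   learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")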
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh new file mode 100755 index 00000000000..22967036cb2 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1c, but with more TDNN layers between each LSTM +# results on sdm1 with ihm ali + +#System tdnn_lstm1c_sp_bi_ihmali_ld5 tdnn_lstm1d_sp_bi_ihmali_ld5 +#WER on dev 39.0 39.1 +#WER on eval 41.9 42.0 +#Final train prob -0.142951 -0.150625 +#Final valid prob -0.249901 -0.248819 +#Final train prob (xent) -1.71779 -1.75401 +#Final valid prob (xent) -2.22776 -2.24072 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1d #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn10 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn11 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..6e73457a772 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1c but with only right context for the TDNNs i.e., (0,3) in place +# of (-3,0,3) +# results on sdm1 with ihm ali + +#System tdnn_lstm1c_sp_bi_ihmali_ld5tdnn_lstm1e_sp_bi_ihmali_ld5 +#WER on dev 39.0 39.4 +#WER on eval 41.9 42.4 +#Final train prob -0.142951 -0.152498 +#Final valid prob -0.249901 -0.251393 +#Final train prob (xent) -1.71779 -1.77722 +#Final valid prob (xent) -2.22776 -2.26705 +# + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
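# Note: the dev/eval comparison in the header above was obtained on sdm1
# using IHM alignments; with the option names defined in this script, an
# invocation along those lines (illustrative only, not the exact command
# used) would look like:
#   local/chain/tuning/run_tdnn_lstm_1e.sh --mic sdm1 --use-ihm-ali true \
#     --train-set train_cleaned --gmm tri3_cleaned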
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay 
include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
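# Note on the xconfig in stage 15 above: Append(0,3) splices each TDNN's
# input at offsets t and t+3, so these layers add right context only,
# whereas the Append(-3,0,3) layers in 1c add three frames on each side.
# As rough arithmetic, per block of three TDNN layers between LSTMs:
#   echo "1e: left=$((0*3)) right=$((3*3))"   # 0 frames left, 9 right
#   echo "1c: left=$((3*3)) right=$((3*3))"   # 9 frames left, 9 right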
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh new file mode 100755 index 00000000000..3c4df056460 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# same as 1a but the neural network has two more TDNN layers (0,3 0,3) +# above the lstm +# results on sdm1 with ihm ali +# +#System tdnn_lstm1a_sp_bi_ihmali_ld5tdnn_lstm1f_sp_bi_ihmali_ld5 +#WER on dev 38.9 39.4 +#WER on eval 42.2 42.7 +#Final train prob -0.142585 -0.15514 +#Final valid prob -0.251197 -0.253257 +#Final train prob (xent) -1.73176 -1.80786 +#Final valid prob (xent) -2.26965 -2.29771 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1f #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn8 input=Append(0,3) dim=512 + tanh-layer name=tdnn9 input=Append(0,3) dim=512 + + ## adding the layers for chain branch + output-layer name=output input=tdnn9 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=tdnn9 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..cce5f2f5f3e --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,293 @@ +#!/bin/bash + +# same as 1c but with smaller minibatch +# using smaller minibatches seems to be better in TDNN+LSTM archs. +# not much difference in other archs. +# results on sdm1 using ihm ali +#System tdnn_lstm1c_sp_bi_ihmali_ld5tdnn_lstm1g_sp_bi_ihmali_ld5 +#WER on dev 39.0 38.3 +#WER on eval 41.9 41.6 +#Final train prob -0.142951 -0.138017 +#Final valid prob -0.249901 -0.238659 +#Final train prob (xent) -1.71779 -1.66834 +#Final valid prob (xent) -2.22776 -2.17419 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1g #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay 
include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
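# Note on stage 16 above: the change relative to 1c is the smaller
# minibatch (--trainer.num-chunk-per-minibatch 64 instead of 128).  With
# chunk_width=150 and a frame-subsampling factor of 3, each chunk carries
# about 150/3 = 50 supervision frames, so a minibatch covers roughly:
#   echo $(( 150 / 3 * 64 ))    # ~3200 output frames per minibatch (1g)
#   echo $(( 150 / 3 * 128 ))   # ~6400 output frames per minibatch (1c)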
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh new file mode 100755 index 00000000000..c306849632a --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -0,0 +1,294 @@ +#!/bin/bash + +# same as 1c but with one more stack of TDNN and LSTM layers +# results on sdm1 using ihm ali +#System tdnn_lstm1c_sp_bi_ihmali_ld5 tdnn_lstm1h_sp_bi_ihmali_ld5 +#WER on dev 39.0 39.4 +#WER on eval 41.9 42.6 +#Final train prob -0.142951 -0.157634 +#Final valid prob -0.249901 -0.24945 +#Final train prob (xent) -1.71779 -1.7585 +#Final valid prob (xent) -2.22776 -2.2512 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1h #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn10 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn11 input=Append(-3,0,3) dim=512 + relu-renorm-layer name=tdnn12 input=Append(-3,0,3) dim=512 + lstmp-layer name=lstm4 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh new file mode 100755 index 00000000000..3f8ff14efd9 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -0,0 +1,292 @@ +#!/bin/bash + +# same as 1g but with TDNN output dim 1024 instead of 512 +# (num-params 1g:21309812 1i: 43447156) +# results on sdm1 using ihm ali +#System tdnn_lstm1g_sp_bi_ihmali_ld5 tdnn_lstm1i_sp_bi_ihmali_ld5 +#WER on dev 38.3 37.6 +#WER on eval 41.6 40.9 +#Final train prob -0.138017 -0.114135 +#Final valid prob -0.238659 -0.245208 +#Final train prob (xent) -1.66834 -1.47648 +#Final valid prob (xent) -2.17419 -2.16365 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1i #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 
output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
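# Note on stage 16 above: unlike the other tuning scripts in this patch,
# 1i drops --chain.left-deriv-truncate 0 in favour of
# --trainer.deriv-truncate-margin 8, matching the swap made in the BLSTM
# recipes elsewhere in this patch.  To check which tuning scripts use which
# option after applying the patch, something like this (run from egs/ami/s5b)
# would do; illustrative only:
#   grep -l -- '--trainer.deriv-truncate-margin' local/chain/tuning/run_tdnn_lstm_1*.sh
#   grep -l -- '--chain.left-deriv-truncate'     local/chain/tuning/run_tdnn_lstm_1*.sh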
+ utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh new file mode 100755 index 00000000000..eb20415e515 --- /dev/null +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/chain/multi_condition/run_tdnn.sh. +# It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + +stage=1 +mic=ihm +nj=30 +min_seg_len=1.55 # min length in seconds... we do this because chain training + # will discard segments shorter than 1.5 seconds. Must remain in sync with + # the same option given to prepare_lores_feats.sh. +train_set=train_cleaned # you might set this to e.g. train_cleaned. +gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + + +num_threads_ubm=32 +rvb_affix=_rvb +nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it + # becomes exp/$mic/nnet3_cleaned or whatever. +num_data_reps=1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +nnet3_affix=${nnet3_affix}$rvb_affix + +gmmdir=exp/${mic}/${gmm} + + +for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 1 ] && [ -f data/$mic/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/$mic/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${mic}/${train_set} data/${mic}/${train_set}_sp + + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/$mic/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev eval; do + utils/copy_data_dir.sh data/$mic/$datadir data/$mic/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/$mic/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev eval; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/$mic/${datadir}_hires + steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires + utils/fix_data_dir.sh data/$mic/${datadir}_hires + done +fi + +if [ $stage -le 2 ]; then + echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" + # we have to combine short segments or we won't be able to train chain models + # on those segments. + utils/data/combine_short_segments.sh \ + data/${mic}/${train_set}_sp_hires $min_seg_len data/${mic}/${train_set}_sp_hires_comb + + # just copy over the CMVN to avoid having to recompute it. + cp data/${mic}/${train_set}_sp_hires/cmvn.scp data/${mic}/${train_set}_sp_hires_comb/ + utils/fix_data_dir.sh data/${mic}/${train_set}_sp_hires_comb/ +fi + +if [ $stage -le 3 ]; then + echo "$0: creating reverberated MFCC features" + + datadir=data/ihm/train_cleaned_sp + + mfccdir=${datadir}_rvb${num_data_reps}_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + if [ ! -f ${datadir}_rvb${num_data_reps}_hires/feats.scp ]; then + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) + + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs "20:10:15:5:0" \ + --background-snrs "20:10:15:5:0" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications ${num_data_reps} \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + ${datadir} ${datadir}_rvb${num_data_reps} + + utils/copy_data_dir.sh ${datadir}_rvb${num_data_reps} ${datadir}_rvb${num_data_reps}_hires + utils/data/perturb_data_dir_volume.sh ${datadir}_rvb${num_data_reps}_hires + + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" ${datadir}_rvb${num_data_reps}_hires + steps/compute_cmvn_stats.sh ${datadir}_rvb${num_data_reps}_hires + utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires + + utils/data/combine_short_segments.sh \ + ${datadir}_rvb${num_data_reps}_hires $min_seg_len ${datadir}_rvb${num_data_reps}_hires_comb + + # just copy over the CMVN to avoid having to recompute it. 
+ cp ${datadir}_rvb${num_data_reps}_hires/cmvn.scp ${datadir}_rvb${num_data_reps}_hires_comb/ + utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires_comb/ + fi + + utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${datadir}_rvb${num_data_reps}_hires + utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires_comb data/${mic}/${train_set}_sp_hires_comb ${datadir}_rvb${num_data_reps}_hires_comb +fi + + +if [ $stage -le 4 ]; then + echo "$0: selecting segments of hires training data that were also present in the" + echo " ... original training data." + + # note, these data-dirs are temporary; we put them in a sub-directory + # of the place where we'll make the alignments. + temp_data_root=exp/$mic/nnet3${nnet3_affix}/tri5 + mkdir -p $temp_data_root + + utils/data/subset_data_dir.sh --utt-list data/${mic}/${train_set}/feats.scp \ + data/${mic}/${train_set}_sp_hires $temp_data_root/${train_set}_hires + + # note: essentially all the original segments should be in the hires data. + n1=$(wc -l /dev/null || true + if [ -z $extra_left_context ]; then + extra_left_context=$chunk_left_context + fi + if [ -z $extra_right_context ]; then + extra_right_context=$chunk_right_context + fi + if [ -z $frames_per_chunk ]; then + frames_per_chunk=$chunk_width + fi + model_opts= + [ ! -z $decode_iter ] && model_opts=" --iter $decode_iter "; + for decode_set in dev eval; do + ( + num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_${decode_set} + steps/nnet3/decode.sh --nj 250 --cmd "$decode_cmd" \ + $model_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/$mic/${decode_set}_hires $decode_dir || exit 1; + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: error detected during decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 79d633b1ebd..e54b5f43128 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 5fa4ea565cd..0ca6062e9c8 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ 
--trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/fisher_data_prep.sh b/egs/aspire/s5/local/fisher_data_prep.sh index 93abf390225..233185f071e 100755 --- a/egs/aspire/s5/local/fisher_data_prep.sh +++ b/egs/aspire/s5/local/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/fisher_english/s5/local/fisher_data_prep.sh b/egs/fisher_english/s5/local/fisher_data_prep.sh index 93abf390225..233185f071e 100755 --- a/egs/fisher_english/s5/local/fisher_data_prep.sh +++ b/egs/fisher_english/s5/local/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index b70da4e852a..d9b11f9fb21 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/fisher_swbd/s5/local/fisher_data_prep.sh b/egs/fisher_swbd/s5/local/fisher_data_prep.sh index dfc29c5a6c6..470577f28d3 100755 --- a/egs/fisher_swbd/s5/local/fisher_data_prep.sh +++ b/egs/fisher_swbd/s5/local/fisher_data_prep.sh @@ -39,7 +39,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data_fisher/links/$subdir else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data_fisher/links/$subdir diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index be6c82a935e..4afa867503a 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -108,9 +108,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a 
left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -136,8 +133,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh index ef7e9d2594f..81732779d37 100755 --- a/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/nnet3/run_tdnn_discriminative.sh @@ -115,9 +115,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -143,8 +140,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $train_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/lre07/v2/local/dnn/fisher_data_prep.sh b/egs/lre07/v2/local/dnn/fisher_data_prep.sh index c7e74dea3bc..70cede2f86c 100755 --- a/egs/lre07/v2/local/dnn/fisher_data_prep.sh +++ b/egs/lre07/v2/local/dnn/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 51ca7db0495..df9b8002d0c 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ 
--trainer.max-param-change 1.414 \ @@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/multi_en/s5/local/fisher_data_prep.sh b/egs/multi_en/s5/local/fisher_data_prep.sh index 386fb5e111c..ae0b9683125 100755 --- a/egs/multi_en/s5/local/fisher_data_prep.sh +++ b/egs/multi_en/s5/local/fisher_data_prep.sh @@ -46,7 +46,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/fisher/links/$subdir else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/fisher/links/$subdir diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index 65a9840df71..ecafb588cfe 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -230,8 +230,9 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 7.36 [ 923 / 12533, 85 ins, 148 del, 690 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch4/wer_13 ### chain results ### -# current best chain result with TDNN (check local/chain/run_tdnn_5f.sh) -%WER 2.94 [ 369 / 12533, 51 ins, 71 del, 247 sub ] exp/chain/tdnn_5f/decode/wer_3_0.5 +# current best chain result with TDNN (check local/chain/run_tdnn_5g.sh) +%WER 2.86 [ 358 / 12533, 46 ins, 61 del, 251 sub ] exp/chain/tdnn_5g/decode/wer_5_0.0 +%WER 2.71 [ 340 / 12533, 58 ins, 59 del, 223 sub ] exp/chain/tdnn_5n/decode/wer_4_0.0 ### nnet1 results ### diff --git a/egs/rm/s5/local/chain/run_tdnn_5g.sh b/egs/rm/s5/local/chain/run_tdnn_5g.sh new file mode 100755 index 00000000000..f6fbe070763 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_5g.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# This is modified from run_tdnn_5f.sh, to use the old topology, as a baseline +# to test the modified transition-model code (by which we hope to be able to +# create more compact decoding graphs for chain models). + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_5g + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/train \ + --ivector-dir exp/nnet2_online/ivectors \ + --tree-dir $treedir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2_online/ivectors \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs true \ + --feat-dir data/train \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir +fi + +if [ $stage -le 9 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 10 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test $dir/decode || exit 1; +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5n.sh old mode 100644 new mode 100755 similarity index 62% rename from egs/rm/s5/local/chain/run_tdnn_5f.sh rename to egs/rm/s5/local/chain/run_tdnn_5n.sh index 0379d16fe13..7fd7b82aa1d --- a/egs/rm/s5/local/chain/run_tdnn_5f.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5n.sh @@ -1,6 +1,9 @@ #!/bin/bash -# this script is a modified version of swbd/run_tdnn_5f.sh +# this script is a modified version of run_tdnn_5g.sh. It uses +# the new transition model and the python version of training scripts. 
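As a quick cross-check of the two new RM chain numbers added to egs/rm/s5/RESULTS above, the usual pattern from that file should work (this assumes both systems were trained into their default directories, exp/chain/tdnn_5g and exp/chain/tdnn_5n):

for x in exp/chain/tdnn_5g exp/chain/tdnn_5n; do
  grep WER $x/decode/wer_* | utils/best_wer.sh
done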
+ + set -e @@ -8,7 +11,7 @@ set -e stage=0 train_stage=-10 get_egs_stage=-10 -dir=exp/chain/tdnn_5f +dir=exp/chain/tdnn_5n # training options num_epochs=12 @@ -43,13 +46,13 @@ fi # run those things. ali_dir=exp/tri3b_ali -treedir=exp/chain/tri4_2y_tree -lang=data/lang_chain_2y +treedir=exp/chain/tri4_5n_tree +lang=data/lang_chain_5n local/online/run_nnet2_common.sh --stage $stage || exit 1; if [ $stage -le 4 ]; then - # Get the alignments as lattices (gives the CTC training more freedom). + # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ @@ -78,51 +81,73 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + mkdir -p $dir + + echo "$0: creating neural net configs"; + + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale-nonlinearity 0.00001 \ + --feat-dir data/train \ + --ivector-dir exp/nnet2_online/ivectors \ + --tree-dir $treedir \ + --relu-dim 450 \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ --xent-regularize 0.1 \ - --leaky-hmm-coefficient 0.1 \ - --l2-regularize 0.00005 \ - --jesus-opts "--jesus-forward-input-dim 200 --jesus-forward-output-dim 500 --jesus-hidden-dim 2000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ - --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0" \ - --apply-deriv-weights false \ - --frames-per-iter 1000000 \ - --lm-opts "--num-extra-lm-states=200" \ - --get-egs-stage $get_egs_stage \ - --minibatch-size $minibatch_size \ - --egs-opts "--frames-overlap-per-eg 0" \ - --frames-per-eg $frames_per_eg \ - --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ - --feat-type raw \ - --online-ivector-dir exp/nnet2_online/ivectors \ - --cmvn-opts "--norm-means=false --norm-vars=false" \ - --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ - --max-param-change $max_param_change \ - --cmd "$decode_cmd" \ - --remove-egs $remove_egs \ - data/train $treedir exp/tri3b_lats $dir || exit 1; + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 1.0 \ + $dir/configs || exit 1; fi if [ $stage -le 8 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet2_online/ivectors \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=200" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs true \ + --feat-dir data/train \ + --tree-dir $treedir \ + --lat-dir exp/tri3b_lats \ + --dir $dir +fi + +if [ $stage -le 9 ]; then steps/online/nnet2/extract_ivectors_online.sh 
--cmd "$train_cmd" --nj 4 \ data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; fi -if [ $stage -le 9 ]; then +if [ $stage -le 10 ]; then # Note: it might appear that this $lang directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context 20 --scoring-opts "--min-lmwt 1" \ + --scoring-opts "--min-lmwt 1" \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ $dir/graph data/test $dir/decode || exit 1; fi -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context 20 \ --nj 20 --cmd "$decode_cmd" \ --online-ivector-dir exp/nnet2_online/ivectors_test \ $dir/graph_ug data/test $dir/decode_ug || exit 1; diff --git a/egs/sprakbanken_swe/s5/RESULTS b/egs/sprakbanken_swe/s5/RESULTS new file mode 100644 index 00000000000..a133a0c2e9f --- /dev/null +++ b/egs/sprakbanken_swe/s5/RESULTS @@ -0,0 +1,19 @@ +%WER 48.86 [ 34040 / 69674, 3407 ins, 7500 del, 23133 sub ] exp/mono/decode_test120_p_spk/wer_9_0.0 + +%WER 24.16 [ 16835 / 69674, 2620 ins, 2887 del, 11328 sub ] exp/tri1/decode_test120_p_spk/wer_13_0.5 + +%WER 23.86 [ 16623 / 69674, 2793 ins, 2576 del, 11254 sub ] exp/tri2a/decode_test120_p_spk/wer_13_0.5 + +%WER 22.66 [ 15791 / 69674, 3016 ins, 2196 del, 10579 sub ] exp/tri2b/decode_test120_p_spk/wer_16_0.0 + +%WER 20.19 [ 14065 / 69674, 2899 ins, 1929 del, 9237 sub ] exp/tri3b/decode_test120_p_spk/wer_17_0.0 + +%WER 19.06 [ 13279 / 69674, 2900 ins, 1673 del, 8706 sub ] exp/tri4a/decode_test120_p_spk/wer_18_0.0 + +—————————————————————————————————————————————————————————————————————————————————— +#full test set + +%WER 18.88 [ 111453 / 590285, 25457 ins, 13698 del, 72298 sub ] exp/tri4a/decode_4g_test/wer_17_0.0 + +%WER 15.97 [ 94242 / 590285, 21022 ins, 12697 del, 60523 sub ] exp/nnet5c/decode_4g_test/wer_11 + diff --git a/egs/sre10/v1/local/dnn/fisher_data_prep.sh b/egs/sre10/v1/local/dnn/fisher_data_prep.sh index c7e74dea3bc..70cede2f86c 100755 --- a/egs/sre10/v1/local/dnn/fisher_data_prep.sh +++ b/egs/sre10/v1/local/dnn/fisher_data_prep.sh @@ -52,7 +52,7 @@ for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \ found_subdir=true ln -s $dir/$subdir data/local/data/links else - new_style_subdir=$(echo $subdir | sed s/fe_03_p2_sph/fisher_eng_tr_sp_d/) + new_style_subdir=$(echo $subdir | sed s/fe_03_p1_sph/fisher_eng_tr_sp_d/) if [ -d $dir/$new_style_subdir ]; then found_subdir=true ln -s $dir/$new_style_subdir data/local/data/links/$subdir diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh deleted file mode 100755 index ded03563711..00000000000 --- a/egs/swbd/s5c/local/chain/compare_wer.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - - -echo -n "System " -for x in $*; do printf "% 10s" $x; done -echo - -echo -n "WER on train_dev(tg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on train_dev(fg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on 
eval2000(tg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on eval2000(fg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "Final train prob " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final train prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..c8aae0b3b94 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh new file mode 100755 index 00000000000..542dae82581 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +models="" +for x in $*; do models="$models tdnn_${x}"; done + +local/chain/compare_wer_general.sh $models diff --git a/egs/swbd/s5c/local/chain/run_blstm.sh 
b/egs/swbd/s5c/local/chain/run_blstm.sh new file mode 120000 index 00000000000..0160247619f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm.sh @@ -0,0 +1 @@ +tuning/run_blstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_lstm.sh b/egs/swbd/s5c/local/chain/run_lstm.sh index 28e5e6cc20c..8b421ac2649 120000 --- a/egs/swbd/s5c/local/chain/run_lstm.sh +++ b/egs/swbd/s5c/local/chain/run_lstm.sh @@ -1 +1 @@ -tuning/run_lstm_6i.sh \ No newline at end of file +tuning/run_lstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 669740d5f27..7b86453e14b 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7f.sh \ No newline at end of file +tuning/run_tdnn_7h.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh b/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..a4fa11e0908 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index 95f7aef2708..a1be44cdbbf 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index 26cdaed29d7..a4333e40b30 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh new file mode 100755 index 00000000000..34dd378a7fe --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev=0 +# and bias-stddev=0 initialization.
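As an aside, the blstm_6i / blstm_6j comparison a few lines below looks like it can be regenerated with the compare_wer_general.sh helper added earlier in this patch, assuming both runs used speed perturbation (hence the _sp directory suffix) and the default decode-directory names:

local/chain/compare_wer_general.sh blstm_6i blstm_6j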
+# This run also accounts for changes in training due to the BackpropTruncationComponent + +#System blstm_6i blstm_6j +#WER on train_dev(tg) 14.11 13.80 +#WER on train_dev(fg) 13.04 12.64 +#WER on eval2000(tg) 16.2 15.6 +#WER on eval2000(fg) 14.6 14.2 +#Final train prob -0.0615713-0.0552637 +#Final valid prob -0.0829338-0.0765151 +#Final train prob (xent) -1.16518 -0.777318 +#Final valid prob (xent) -1.26028 -0.912595 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index fbced146199..ac22e858aea 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index c5548cbfa5c..db0a0fe7b1a 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh new file mode 100755 index 00000000000..90afd1fb4cd --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev=0 +# and bias-stddev=0 initialization. The results also account for changes +# due to BackpropTruncationComponent in place of ClipGradientComponent. +# Note that removal of layerwise discriminative pretraining does not result +# in a lot of improvement in LSTMs, compared to TDNNs (7f vs 7g).
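A small worked example for the xent learning-rate factor that the config-generation stage of this script computes: run_lstm_6j.sh sets xent_regularize=0.025, so the python one-liner it uses evaluates to 0.5 / 0.025 = 20 (the variable name is copied from the script; the standalone echo below is only illustrative):

xent_regularize=0.025
echo "print 0.5/$xent_regularize" | python   # prints 20.0, used as learning-rate-factor on the xent output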
+ +#System lstm_6i_ld5 lstm_6j_ld5 +#WER on train_dev(tg) 14.65 14.66 +#WER on train_dev(fg) 13.38 13.42 +#WER on eval2000(tg) 16.9 16.8 +#WER on eval2000(fg) 15.4 15.4 +#Final train prob -0.0751668-0.0824531 +#Final valid prob -0.0928206-0.0989325 +#Final train prob (xent) -1.34549 -1.15506 +#Final valid prob (xent) -1.41301 -1.24364 +# +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index 28c20c92ab0..aa666e4c5ab 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then --chain.xent-regularize $xent_regularize \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 2.0 \ --trainer.num-epochs 4 \ @@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ --egs.stage $get_egs_stage \ --egs.opts="--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh new file mode 100755 index 00000000000..7a4512097d3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -0,0 +1,228 @@ +#!/bin/bash + + +# 7g is same as 7f but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. + + +# System 7f 7g +# WER on train_dev(tg) 14.46 13.85 +# WER on train_dev(fg) 13.23 12.67 +# WER on eval2000(tg) 17.0 16.5 +# WER on eval2000(fg) 15.4 14.8 +# Final train prob -0.0882071 -0.0885075 +# Final valid prob -0.107545 -0.113462 +# Final train prob (xent) -1.26246 -1.25788 +# Final valid prob (xent) -1.35525 -1.37058 + + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +#common_egs_dir=exp/chain/tdnn_7e_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + # it doesn't make sense to have -6,0,6 splicing for a chain model + # as we compute a sequence of outputs and computation can be shared + # this has to be split into two -3,0,3 layers. But I will keep this + # to have same setup as 7f + relu-renorm-layer name=tdnn6 input=Append(-6,0,6) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh new file mode 100755 index 00000000000..00743ca9ebf --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +#System tdnn_7g tdnn_7h +#WER on train_dev(tg) 13.98 13.84 +#WER on train_dev(fg) 12.78 12.84 +#WER on eval2000(tg) 16.7 16.5 +#WER on eval2000(fg) 14.9 14.8 +#Final train prob -0.0817467-0.0889771 +#Final valid prob -0.110475 -0.113102 +#Final train prob (xent) -1.20065 -1.2533 +#Final valid prob (xent) -1.3313 -1.36743 +# +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
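Everything above the "End configuration section" marker can be overridden from the command line, since the script passes "$@" through utils/parse_options.sh just below; a purely illustrative invocation (the option values here are hypothetical) would be:

local/chain/tuning/run_tdnn_7h.sh --stage 12 --train-stage -10 --num-epochs 6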
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh new file mode 100755 index 00000000000..1b3e86715ed --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7i.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +# Same as 7h but double the number of parameters (27983950 vs 15551509) + +set -e + + +#System tdnn_7h tdnn_7i +#WER on train_dev(tg) 13.84 13.48 +#WER on train_dev(fg) 12.84 12.47 +#WER on eval2000(tg) 16.5 16.4 +#WER on eval2000(fg) 14.8 14.9 +#Final train prob -0.0889771-0.0785415 +#Final valid prob -0.113102 -0.105757 +#Final train prob (xent) -1.2533 -1.15785 +#Final valid prob (xent) -1.36743 -1.28397 +# +# configs for 'chain' +affix= +stage=12 +train_stage=0 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7i # Note: _sp will get added to this if $speed_perturb == true. 
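+# (Editorial illustration, not part of the original script: the parameter
+#  increase over tdnn_7h quoted above comes from widening the hidden TDNN
+#  layers in the xconfig below from dim=625 to dim=1024, while the prefinal
+#  layers stay at dim=625.)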
+decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_7g_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh new file mode 100755 index 00000000000..b19ea6eafab --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -0,0 +1,221 @@ +#!/bin/bash + + +#System tdnn_7h tdnn_7j +#WER on train_dev(tg) 13.84 14.15 +#WER on train_dev(fg) 12.84 12.96 +#WER on eval2000(tg) 16.5 16.8 +#WER on eval2000(fg) 14.8 15.1 +#Final train prob -0.0889771-0.0910883 +#Final valid prob -0.113102 -0.112464 +#Final train prob (xent) -1.2533 -1.31768 +#Final valid prob (xent) -1.36743 -1.41603 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7j # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_7g_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + tdnn-relu-renorm-layer name=tdnn2 splice-indexes=-1,0,1 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn3 splice-indexes=-1,0,1 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn4 splice-indexes=-3,0,3 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn5 splice-indexes=-3,0,3 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn6 splice-indexes=-3,0,3 dim=768 subset-dim=384 + tdnn-relu-renorm-layer name=tdnn7 splice-indexes=-3,0,3 dim=768 subset-dim=384 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=768 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=768 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + nnet3-info $dir/configs/ref.raw |grep num-param +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh new file mode 100644 index 00000000000..06ae6f49728 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# 7l is based on 7h, but adding a 64 dim lowrank module in the xent branch +#System tdnn_7h tdnn_7l +#WER on train_dev(tg) 13.84 13.83 +#WER on train_dev(fg) 12.84 12.88 +#WER on eval2000(tg) 16.5 16.4 +#WER on eval2000(fg) 14.8 14.7 +#Final train prob -0.089 -0.090 +#Final valid prob -0.113 -0.116 +#Final train prob (xent) -1.25 -1.38 +#Final valid prob (xent) -1.36 -1.48 +#Time consuming one iter 53.56s 48.18s +#Time reduction percent 10.1% +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7l # Note: _sp will get added to this if $speed_perturb == true. 
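+# (Editorial illustration, not part of the original script: the 64-dim
+#  low-rank module is the extra 'prefinal-lowrank-xent' layer in the xconfig
+#  below. It shrinks the final xent affine from roughly 625*num_targets
+#  parameters to about 625*64 + 64*num_targets, which is presumably where the
+#  per-iteration time saving quoted above comes from; the chain branch is
+#  unchanged relative to 7h.)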
+decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625 + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + relu-renorm-layer name=prefinal-lowrank-xent input=prefinal-xent dim=64 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..e32fdffb69d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# 1a is a straight forward combination of tdnn_7h and lstm_6j. +# TDNN layers are stacked before LSTM. +# This model has the same performance as the BLSTM. + + +#System lstm_6j tdnn_7h blstm_6j tdnn_lstm_1a +#WER on train_dev(tg) 14.66 13.84 13.80 13.42 +#WER on train_dev(fg) 13.42 12.84 12.64 12.42 +#WER on eval2000(tg) 16.8 16.5 15.6 15.7 +#WER on eval2000(fg) 15.4 14.8 14.2 14.2 +#Final train prob -0.0824531-0.0889771-0.0552637-0.0538088 +#Final valid prob -0.0989325 -0.113102-0.0765151-0.0800484 +#Final train prob (xent) -1.15506 -1.2533 -0.777318 -0.7603 +#Final valid prob (xent) -1.24364 -1.36743 -0.912595 -0.949909 + + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true. 
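+# (Editorial note, an interpretation not stated in the original script: because
+#  the LSTM layers are recurrent, the options below give each training chunk 40
+#  frames of extra left context (chunk_left_context) to warm up the recurrent
+#  state, and 50 frames at decode time (extra_left_context); label_delay=5
+#  delays the output so the unidirectional LSTMs effectively see a few frames
+#  of future input.)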
+decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..555afa467fa --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,237 @@ +#!/bin/bash + +# Unlike 1a this setup interleaves the TDNN and LSTM layers. 
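+# (Editorial illustration, not part of the original script: "interleaves" here
+#  means the layer order in the xconfig below is tdnn1-3, lstm1, tdnn4-5,
+#  lstm2, tdnn6-7, lstm3, instead of stacking all seven TDNN layers before the
+#  three LSTM layers as in tdnn_lstm_1a.)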
+ +#System tdnn_lstm_1a_ld5tdnn_lstm_1b_ld5 +#WER on train_dev(tg) 13.42 13.00 +#WER on train_dev(fg) 12.42 12.03 +#WER on eval2000(tg) 15.7 15.3 +#WER on eval2000(fg) 14.2 13.9 +#Final train prob -0.0538088 -0.056294 +#Final valid prob -0.0800484-0.0813322 +#Final train prob (xent) -0.7603 -0.777787 +#Final valid prob (xent) -0.949909 -0.939146 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 1908b390151..99f6a31e708 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -8,8 +8,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# -. cmd.sh +# stage=0 @@ -26,7 +25,7 @@ extra_right_context=40 extra_left_context_initial=-1 extra_right_context_final=-1 -. cmd.sh +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -52,9 +51,9 @@ effective_learning_rate=0.0000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false +adjust_priors=true # May need to be set to false # because it does not help in some setups modify_learning_rates=true last_layer_factor=0.1 @@ -64,8 +63,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/prepare_dict.sh b/egs/tedlium/s5_r2/local/prepare_dict.sh index 9ba31893b22..18837c21085 100755 --- a/egs/tedlium/s5_r2/local/prepare_dict.sh +++ b/egs/tedlium/s5_r2/local/prepare_dict.sh @@ -3,13 +3,14 @@ # Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) # 2016 Daniel Galvez +# 2016 Vincent Nguyen # Apache 2.0 # dir=data/local/dict_nosp mkdir -p $dir -srcdict=db/cantab-TEDLIUM/cantab-TEDLIUM.dct +srcdict=db/TEDLIUM_release2/TEDLIUM.152k.dic [ ! 
-r $srcdict ] && echo "Missing $srcdict" && exit 1
diff --git a/egs/tedlium/s5_r2/local/run_learn_lex.sh b/egs/tedlium/s5_r2/local/run_learn_lex.sh
new file mode 100755
index 00000000000..4960fbd848e
--- /dev/null
+++ b/egs/tedlium/s5_r2/local/run_learn_lex.sh
@@ -0,0 +1,136 @@
+#! /bin/bash
+#
+# This script demonstrates a lexicon learning recipe, which aims to improve
+# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
+# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh
+# for an explanation of the options.
+#
+# Copyright 2016 Xiaohui Zhang
+# Apache 2.0
+
+. ./cmd.sh
+. ./path.sh
+
+oov_symbol=""
+# The user may have an English g2p model ready.
+g2p_mdl_dir=
+# The dir which contains the reference lexicon (most probably hand-derived)
+# we want to expand/improve, and nonsilence_phones.txt, etc., which we need
+# for building new dict dirs.
+ref_dict=data/local/dict
+# Acoustic training data we use to get alternative
+# pronunciations and collect acoustic evidence.
+data=data/train
+# The cut-off parameter used to select pronunciation candidates from phone
+# decoding. We remove pronunciations with probabilities less than this value,
+# after normalizing the probs s.t. the max-prob is 1.0 for each word.
+min_prob=0.4
+# Mean of priors (summing up to 1) assigned to the three exclusive pronunciation
+# sources: reference lexicon, g2p, and phone decoding (used in the Bayesian
+# pronunciation selection procedure). We recommend setting a larger prior
+# mean for the reference lexicon, e.g. '0.6,0.2,0.2'.
+prior_mean="0.7,0.2,0.1"
+# Total amount of prior counts we add to all pronunciation candidates of
+# each word. By multiplying it with the prior mean of a source, and then dividing
+# by the number of candidates (for a word) from this source, we get the
+# prior counts we actually add to each candidate.
+prior_counts_tot=15
+# In the Bayesian pronunciation selection procedure, for each word, we
+# choose candidates (from all three sources) with the highest posteriors
+# until the total prob mass hits this amount.
+# It's used in a similar fashion when we apply G2P.
+variants_prob_mass=0.6
+# In the Bayesian pronunciation selection procedure, for each word,
+# after the total prob mass of selected candidates hits variants-prob-mass,
+# we continue to pick up reference candidates with the highest posteriors
+# until the total prob mass hits this amount (must be >= variants_prob_mass).
+variants_prob_mass_ref=0.95
+# Intermediate outputs of the lexicon learning stage will be put into dir
+dir=exp/tri3_lex_work
+nj=35
+decode_nj=30
+stage=0
+lexlearn_stage=0
+
+. utils/parse_options.sh # accept options
+
+
+# The reference vocab is the list of words for which we already have hand-derived pronunciations.
+ref_vocab=data/local/vocab.txt
+cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;
+
+# Get a G2P generated lexicon for OOV words (w.r.t. the reference lexicon)
+# in the acoustic training data.
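+# (Editorial illustration, not part of the original recipe: with the defaults
+#  above, prior_counts_tot=15 and prior_mean="0.7,0.2,0.1", the reference
+#  lexicon source gets 15 * 0.7 = 10.5 prior counts in total for each word; if
+#  a word has 3 reference candidates, each of them receives 10.5 / 3 = 3.5
+#  prior counts, and likewise 15 * 0.2 and 15 * 0.1 are shared among the G2P
+#  and phone-decoding candidates respectively.)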
+if [ $stage -le 0 ]; then
+  if [ -z $g2p_mdl_dir ]; then
+    g2p_mdl_dir=exp/g2p
+    steps/dict/train_g2p.sh --cmd "$decode_cmd --mem 4G" $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
+  fi
+  awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
+    $data/text | sort -u > $data/train_vocab.txt || exit 1;
+  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
+    $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
+  steps/dict/apply_g2p.sh --var-counts 4 $data/oov_train.txt \
+    $g2p_mdl_dir exp/g2p/oov_lex_train || exit 1;
+  cat exp/g2p/oov_lex_train/lexicon.lex | cut -f1,3 | \
+    tr -s '\t' ' ' | sort | uniq > $data/lexicon_oov_g2p.txt || exit 1;
+fi
+
+# Learn a lexicon based on the acoustic training data and the reference lexicon.
+if [ $stage -le 1 ]; then
+  steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
+    --min-prob $min_prob --variants-prob-mass $variants_prob_mass \
+    --variants-prob-mass-ref $variants_prob_mass_ref \
+    --prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \
+    --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl true \
+    $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_learned_nosp \
+    $dir || exit 1;
+fi
+
+# Add pronunciation probs to the learned lexicon.
+if [ $stage -le 1 ]; then
+  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
+    data/local/dict_learned_nosp $oov_symbol data/local/lang_learned_nosp data/lang_learned_nosp || exit 1;
+
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    $data data/lang_learned_nosp exp/tri2 exp/tri2_ali_learned_lex_nosp || exit 1;
+
+  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_learned_nosp exp/tri2_ali_learned_lex_nosp || exit 1;
+
+  utils/dict_dir_add_pronprobs.sh --max-normalize true \
+    data/local/dict_learned_nosp exp/tri2_ali_learned_lex_nosp/pron_counts_nowb.txt \
+    exp/tri2_ali_learned_lex_nosp/sil_counts_nowb.txt \
+    exp/tri2_ali_learned_lex_nosp/pron_bigram_counts_nowb.txt data/local/dict_learned || exit 1;
+
+  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
+    data/local/dict_learned $oov_symbol data/local/lang_learned data/lang_learned || exit 1;
+fi
+
+# Re-train the acoustic model using the learned lexicon
+if [ $stage -le 2 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+    $data data/lang_learned exp/tri3 exp/tri3_ali_learned_lex || exit 1;
+
+  steps/train_sat.sh --cmd "$train_cmd" \
+    5000 100000 $data data/lang_learned exp/tri3_ali_learned_lex exp/tri3_learned_lex || exit 1;
+fi
+
+# Decode
+if [ $stage -le 3 ]; then
+  cp -rT data/lang_learned data/lang_learned_rescore || exit 1;
+  ! cmp data/lang_nosp/words.txt data/lang_learned/words.txt &&\
+    echo "$0: The vocab of the learned lexicon and the reference vocab may be incompatible."
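+  # (Editorial note, an assumption not stated in the original script:
+  #  prepare_lang.sh does not create G.fst or G.carpa, so the grammar from the
+  #  original lang_nosp directories is reused below; this is only valid when
+  #  the word list of the learned lexicon matches, hence the check above.)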
+ cp data/lang_nosp/G.fst data/lang_learned/ + cp data/lang_nosp_rescore/G.carpa data/lang_learned_rescore/ + utils/mkgraph.sh data/lang_learned exp/tri3_learned_lex exp/tri3_learned_lex/graph || exit 1; + + for dset in dev test; do + ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3_learned_lex/graph data/${dset} exp/tri3_learned_lex/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_learned data/lang_learned_rescore \ + data/${dset} exp/tri3_learned_lex/decode_${dset} exp/tri3_learned_lex/decode_${dset}_rescore || exit 1; + ) & + done +fi + +wait diff --git a/egs/tedlium/s5_r2/local/ted_train_lm.sh b/egs/tedlium/s5_r2/local/ted_train_lm.sh index 3d46726b5ca..3a1bef567fb 100755 --- a/egs/tedlium/s5_r2/local/ted_train_lm.sh +++ b/egs/tedlium/s5_r2/local/ted_train_lm.sh @@ -44,7 +44,7 @@ num_dev_sentences=10000 # These example numbers of metaparameters is for 4-gram model (with min-counts) # running with train_lm.py. # The dev perplexity should be close to the non-bypassed model. -bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.837,0.023,0.761,0.065,0.029,0.015,0.999,0.361,0.157,0.080,0.999,0.625,0.2164,0.2162" +bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406" # Note: to use these example parameters, you may need to remove the .done files # to make sure the make_lm_dir.py be called and tain only 3-gram model #for order in 3; do @@ -58,8 +58,8 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true - # cantab-TEDLIUM is the larger data source. gzip it. - sed 's/ <\/s>//g' < db/cantab-TEDLIUM/cantab-TEDLIUM.txt | gzip -c > ${dir}/data/text/train.txt.gz + # Unzip TEDLIUM 6 data sources, normalize apostrophe+suffix to previous word, gzip the result. + gunzip -c db/TEDLIUM_release2/LM/*.en.gz | sed 's/ <\/s>//g' | local/join_suffix.py | gzip -c > ${dir}/data/text/train.txt.gz # use a subset of the annotated training data as the dev set . # Note: the name 'dev' is treated specially by pocolm, it automatically # becomes the dev set. @@ -76,7 +76,7 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt # get wordlist - awk '{print $1}' db/cantab-TEDLIUM/cantab-TEDLIUM.dct | sort | uniq > ${dir}/data/wordlist + awk '{print $1}' db/TEDLIUM_release2/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort | uniq > ${dir}/data/wordlist fi order=4 @@ -103,13 +103,7 @@ if [ $stage -le 1 ]; then ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - - # current results, after adding --limit-unk-history=true: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/wordlist_4.pocolm was -5.13486225358 per word [perplexity = 169.840923284] over 18290.0 words. - # older results (after adding min-counts): - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/wordlist_4.pocolm was -5.13902242865 per word [perplexity = 170.514153159] over 18290.0 words. - # even older results, before adding min-counts: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4 was -5.10576291033 per word [perplexity = 164.969879761] over 18290.0 words. 
+ #[perplexity = 157.87] over 18290.0 words fi if [ $stage -le 2 ]; then @@ -121,10 +115,8 @@ if [ $stage -le 2 ]; then get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' # current results, after adding --limit-unk-history=true: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.17558740241 per word [perplexity = 176.90049554] over 18290.0 words. - # older results, after adding min-counts: - # get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.17638942756 per word [perplexity = 177.006688203] over 18290.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz @@ -140,11 +132,8 @@ if [ $stage -le 3 ]; then get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.28036622198 per word [perplexity = 196.441803486] over 18290.0 words. - # older results, after adding min-counts: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.28346290049 per word [perplexity = 197.123843355] over 18290.0 words. - # even older results, before adding min-counts: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.27623197813 per word [perplexity = 195.631341646] over 18290.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz fi diff --git a/egs/tedlium/s5_r2/run.sh b/egs/tedlium/s5_r2/run.sh index 754cec0494d..19bc92a738c 100755 --- a/egs/tedlium/s5_r2/run.sh +++ b/egs/tedlium/s5_r2/run.sh @@ -185,7 +185,7 @@ fi if [ $stage -le 17 ]; then # This will only work if you have GPUs on your system (and note that it requires # you to have the queue set up the right way... see kaldi-asr.org/doc/queue.html) - local/chain/run_tdnn.sh + local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" fi # The nnet3 TDNN recipe: diff --git a/egs/voxforge/s5/local/voxforge_prepare_dict.sh b/egs/voxforge/s5/local/voxforge_prepare_dict.sh index d27e8be69f9..9936ba7b556 100755 --- a/egs/voxforge/s5/local/voxforge_prepare_dict.sh +++ b/egs/voxforge/s5/local/voxforge_prepare_dict.sh @@ -12,7 +12,7 @@ echo "=== Preparing the dictionary ..." if [ ! -f $locdict/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." 
- mkdir -p $locdict + mkdir -p $locdict svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ $locdict/cmudict || exit 1; fi @@ -64,6 +64,7 @@ g2p.py --model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon- cat $locdict/lexicon-oov.txt $locdict/lexicon-iv.txt |\ sort > $locdict/lexicon.txt +rm $locdict/lexiconp.txt 2>/dev/null || true echo "--- Prepare phone lists ..." echo SIL > $locdict/silence_phones.txt diff --git a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh index ad1c12a835a..0b6d7bb3970 100755 --- a/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_lstm_discriminative.sh @@ -123,9 +123,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -151,8 +148,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh index b7ace847c6a..a514e354eef 100755 --- a/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/wsj/s5/local/nnet3/run_tdnn_discriminative.sh @@ -107,9 +107,6 @@ model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | a left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor $(cat $srcdir/frame_subsampling_factor)" @@ -135,8 +132,7 @@ if [ -z "$degs_dir" ]; then --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + $frame_subsampling_opt \ --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi diff --git a/egs/wsj/s5/local/run_segmentation.sh b/egs/wsj/s5/local/run_segmentation.sh index 553260c0f0c..458536162cb 100755 --- a/egs/wsj/s5/local/run_segmentation.sh +++ b/egs/wsj/s5/local/run_segmentation.sh @@ -8,52 +8,75 @@ # bigram language model built from the reference, and then work out the # segmentation from a ctm like file. +stage=0 + . ./cmd.sh . 
./path.sh -local/append_utterances.sh data/train_si284 data/train_si284_long -steps/cleanup/split_long_utterance.sh \ - --seg-length 30 --overlap-length 5 \ - data/train_si284_long data/train_si284_split +if [ $stage -le 0 ]; then + local/append_utterances.sh data/train_si284 data/train_si284_long + steps/cleanup/split_long_utterance.sh \ + --seg-length 30 --overlap-length 5 \ + data/train_si284_long data/train_si284_split +fi -steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ - data/train_si284_split exp/make_mfcc/train_si284_split mfcc || exit 1; -steps/compute_cmvn_stats.sh data/train_si284_split \ - exp/make_mfcc/train_si284_split mfcc || exit 1; +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ + data/train_si284_split exp/make_mfcc/train_si284_split mfcc || exit 1; + steps/compute_cmvn_stats.sh data/train_si284_split \ + exp/make_mfcc/train_si284_split mfcc || exit 1; +fi -steps/cleanup/make_segmentation_graph.sh \ - --cmd "$mkgraph_cmd" --nj 32 \ - data/train_si284_split/ data/lang exp/tri2b/ \ - exp/tri2b/graph_train_si284_split || exit 1; +if [ $stage -le 2 ]; then + steps/cleanup/make_segmentation_graph.sh \ + --cmd "$mkgraph_cmd" --nj 32 \ + data/train_si284_split/ data/lang exp/tri2b/ \ + exp/tri2b/graph_train_si284_split || exit 1; +fi -steps/cleanup/decode_segmentation.sh \ - --nj 64 --cmd "$decode_cmd" --skip-scoring true \ - exp/tri2b/graph_train_si284_split/lats \ - data/train_si284_split exp/tri2b/decode_train_si284_split || exit 1; +if [ $stage -le 3 ]; then + steps/cleanup/decode_segmentation.sh \ + --nj 64 --cmd "$decode_cmd" --skip-scoring true \ + exp/tri2b/graph_train_si284_split \ + data/train_si284_split exp/tri2b/decode_train_si284_split || exit 1; +fi -steps/get_ctm.sh --cmd "$decode_cmd" data/train_si284_split \ - exp/tri2b/graph_train_si284_split exp/tri2b/decode_train_si284_split +if [ $stage -le 4 ]; then + steps/get_ctm.sh --cmd "$decode_cmd" data/train_si284_split \ + exp/tri2b/graph_train_si284_split exp/tri2b/decode_train_si284_split +fi -steps/cleanup/make_segmentation_data_dir.sh --wer-cutoff 0.9 \ - --min-sil-length 0.5 --max-seg-length 15 --min-seg-length 1 \ - exp/tri2b/decode_train_si284_split/score_10/train_si284_split.ctm \ - data/train_si284_split data/train_si284_reseg +if [ $stage -le 5 ]; then + steps/cleanup/make_segmentation_data_dir.sh --wer-cutoff 0.9 \ + --min-sil-length 0.5 --max-seg-length 15 --min-seg-length 1 \ + exp/tri2b/decode_train_si284_split/score_10/train_si284_split.ctm \ + data/train_si284_split data/train_si284_reseg +fi # Now, use the re-segmented data for training. 
-steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ - data/train_si284_reseg exp/make_mfcc/train_si284_reseg mfcc || exit 1; -steps/compute_cmvn_stats.sh data/train_si284_reseg \ - exp/make_mfcc/train_si284_reseg mfcc || exit 1; - -steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ - data/train_si284_reseg data/lang exp/tri3b exp/tri3b_ali_si284_reseg || exit 1; - -steps/train_sat.sh --cmd "$train_cmd" \ - 4200 40000 data/train_si284_reseg \ - data/lang exp/tri3b_ali_si284_reseg exp/tri4c || exit 1; - -utils/mkgraph.sh data/lang_test_tgpr exp/tri4c exp/tri4c/graph_tgpr || exit 1; -steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ - exp/tri4c/graph_tgpr data/test_dev93 exp/tri4c/decode_tgpr_dev93 || exit 1; -steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ - exp/tri4c/graph_tgpr data/test_eval92 exp/tri4c/decode_tgpr_eval92 || exit 1; +if [ $stage -le 6 ]; then + steps/make_mfcc.sh --cmd "$train_cmd" --nj 64 \ + data/train_si284_reseg exp/make_mfcc/train_si284_reseg mfcc || exit 1; + steps/compute_cmvn_stats.sh data/train_si284_reseg \ + exp/make_mfcc/train_si284_reseg mfcc || exit 1; +fi + +if [ $stage -le 7 ]; then + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/train_si284_reseg data/lang exp/tri3b exp/tri3b_ali_si284_reseg || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 4200 40000 data/train_si284_reseg \ + data/lang exp/tri3b_ali_si284_reseg exp/tri4c || exit 1; +fi + + +if [ $stage -le 9 ]; then + utils/mkgraph.sh data/lang_test_tgpr exp/tri4c exp/tri4c/graph_tgpr || exit 1; + steps/decode_fmllr.sh --nj 10 --cmd "$decode_cmd" \ + exp/tri4c/graph_tgpr data/test_dev93 exp/tri4c/decode_tgpr_dev93 || exit 1; + steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri4c/graph_tgpr data/test_eval92 exp/tri4c/decode_tgpr_eval92 || exit 1; +fi diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index d1819bb51e2..2660ebce479 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -228,7 +228,7 @@ sub SplitLongSegment { $aligned_ctm->[$seg_end_index]->[2] - $aligned_ctm->[$seg_start_index]->[1]; my $current_seg_index = $seg_start_index; - my $aligned_ctm_size = scalar(@{$aligned_ctm}); + my $aligned_ctm_size = scalar(@{$aligned_ctm}); while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size-1) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, $max_seg_length); @@ -318,7 +318,7 @@ sub ProcessWav { $aligned_ctm[-1]->[3] += 1; } else { push(@aligned_ctm, ["", $start, $dur, 1]); - } + } } else { # Case 2.3: substitution. 
     push(@aligned_ctm, [$ref_word, $start, $dur, 1]);
@@ -417,11 +417,21 @@ sub InsertSilence {
     my @col = split;
     @col >= 2 || die "Error: bad line $_\n";
     my $wav = shift @col;
-    my @pairs = split(" $separator ", join(" ", @col));
-    for (my $x = 0; $x < @pairs; $x += 1) {
-      my @col1 = split(" ", $pairs[$x]);
-      @col1 == 2 || die "Error: bad pair $pairs[$x]\n";
-      $pairs[$x] = \@col1;
+    if ( (@col + 0) % 3 != 2) {
+      die "Bad line in align-text output (unexpected number of fields): $_";
+    }
+    my @pairs = ();
+
+    for (my $x = 0; $x * 3 + 2 <= @col; $x++) {
+      my $first_word = $col[$x * 3];
+      my $second_word = $col[$x * 3 + 1];
+      if ($x * 3 + 2 < @col) {
+        if ($col[$x*3 + 2] ne $separator) {
+          die "Bad line in align-text output (expected separator '$separator'): $_";
+        }
+      }
+      # the [ ] expression returns a reference to a new anonymous array.
+      push(@pairs, [ $first_word, $second_word ]);
     }
     ! defined($aligned{$wav}) || die "Error: $wav has already been processed\n";
     $aligned{$wav} = \@pairs;
diff --git a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh
index b7eaeb1319f..9091764924a 100755
--- a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh
+++ b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh
@@ -117,23 +117,34 @@ if [ $stage -le 8 ]; then

   export LC_ALL=C

-  cat $dir/word.ctm | awk '{printf("%s-%s %09d START %s\n", $1, $2, 100*$3, $5); printf("%s-%s %09d END %s\n", $1, $2, 100*($3+$4), $5);}' | \
-    sort >$dir/word_processed.ctm
+  cat $dir/word.ctm | awk '{printf("%s-%s %010.0f START %s\n", $1, $2, 1000*$3, $5); printf("%s-%s %010.0f END %s\n", $1, $2, 1000*($3+$4), $5);}' | \
+    sort > $dir/word_processed.ctm

-  cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \
-    sort >$dir/phone_processed.ctm
+  # filter out those utterances which only appear in phone_processed.ctm but not in word_processed.ctm
+  cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %010.0f PHONE %s\n", $1, $2, 1000*($3+(0.5*$4)), $5);}' | \
+    awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/word_processed.ctm - \
+    > $dir/phone_processed.ctm

   # merge-sort both ctm's
   sort -m $dir/word_processed.ctm $dir/phone_processed.ctm > $dir/combined.ctm
-
 fi
+# after merge-sort of the two ctm's, we add <eps> to cover "deserted" phones due to precision limits, and then merge all consecutive <eps>'s.
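The combined.ctm built above interleaves, per recording/channel key (the ctm's first two fields joined by "-") and in time order, each word's START and END markers with the PHONE markers whose midpoints fall between them. The stage-9 pipeline just below walks that stream and writes one "utt word phone1 ... phoneN" line per word into ctm_prons.txt, attaching phones that fall outside any word to <eps>. As a rough illustration of the grouping idea only (not the recipe's actual awk/perl, and omitting the <eps> handling), a minimal Python sketch on made-up marker lines:

def group_prons(lines):
    """Yield (utt, word, phones) from a merge-sorted stream of marker lines."""
    cur_word, phones = None, []
    for line in lines:
        utt, _, marker, token = line.split()
        if marker == "START":        # a word starts here
            cur_word, phones = token, []
        elif marker == "PHONE":      # phone midpoint falls inside the current word
            phones.append(token)
        elif marker == "END":        # word finished: emit its pronunciation
            yield utt, cur_word, phones

demo = [
    "utt1-1 0000012340 START hello",
    "utt1-1 0000012410 PHONE hh",
    "utt1-1 0000012520 PHONE ah",
    "utt1-1 0000012650 PHONE l",
    "utt1-1 0000012780 PHONE ow",
    "utt1-1 0000012900 END hello",
]
for utt, word, phones in group_prons(demo):
    print(utt, word, " ".join(phones))   # -> utt1-1 hello hh ah l ow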
 if [ $stage -le 9 ]; then
-  awk '{print $3, $4}' $dir/combined.ctm | \
-    perl -e ' while (<>) { chop; @A = split(" ", $_); ($a,$b) = @A; if ($a eq "START") { $cur_word = $b; @phones = (); }
-    if ($a eq "END") { print $cur_word, " ", join(" ", @phones), "\n"; }
-    if ($a eq "PHONE") { push @phones, $b; }} ' | sort | uniq -c | sort -nr > $dir/prons.txt
+  awk '{print $1, $3, $4}' $dir/combined.ctm | \
+    perl -e ' while (<>) { chop; @A = split(" ", $_); ($utt, $a,$b) = @A; if ($a eq "START") { $cur_word = $b; @phones = (); }
+    if ($a eq "END") { print $utt, " ", $cur_word, " ", join(" ", @phones), "\n"; }
+    if ($a eq "PHONE") { if ($prev eq "END") {print $utt, " ", "<eps>", " ", $b, "\n";} else {push @phones, $b;}} $prev = $a;} ' |\
+    awk 'BEGIN{merge_prev=0;} {utt=$1;word=$2;pron=$3;for (i=4;i<=NF;i++) pron=pron" "$i;
+    if (word_prev == "<eps>" && word == "<eps>" && utt_prev == utt) {merge=0;pron_prev=pron_prev" "pron;} else {merge=1;}
+    if(merge_prev==1) {print utt_prev, word_prev, pron_prev;};
+    merge_prev=merge; utt_prev=utt; word_prev=word; pron_prev=pron;}
+    END{if(merge_prev==1) {print utt_prev, word_prev, pron_prev;}}' > $dir/ctm_prons.txt
+
+  steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words
+  steps/cleanup/internal/get_pron_stats.py $dir/ctm_prons.txt $phone_lang/phones/silence.txt $phone_lang/phones/optional_silence.txt $dir/non_scored_words - | \
+    sort -nr > $dir/prons.txt
 fi

 if [ $stage -le 10 ]; then
diff --git a/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py
new file mode 100755
index 00000000000..414875f9013
--- /dev/null
+++ b/egs/wsj/s5/steps/cleanup/internal/get_pron_stats.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Xiaohui Zhang
+# Apache 2.0.
+
+from __future__ import print_function
+import argparse
+import sys
+import warnings
+
+# Collect pronunciation stats from a ctm_prons.txt file of the form output
+# by steps/cleanup/debug_lexicon.sh. This input file has lines of the form:
+# utt_id word phone1 phone2 .. phoneN
+# e.g.
+# foo-bar123-342 hello h eh l l ow
+# (and this script does require that lines from the same utterance be ordered in
+# order of time).
+# The output of this program is word pronunciation stats of the form:
+# count word phone1 .. phoneN
+# e.g.:
+# 24.0 hello h ax l l ow
+# This program uses various heuristics to account for the fact that the input
+# ctm_prons.txt file may not always be well aligned. As a result of some of
+# these heuristics the counts will not always be integers.
+
+def GetArgs():
+    parser = argparse.ArgumentParser(description = "Accumulate pronunciation statistics from "
+                                     "a ctm_prons.txt file.",
+                                     epilog = "See steps/cleanup/debug_lexicon.sh for example")
+    parser.add_argument("ctm_prons_file", metavar = "", type = str,
+                        help = "File containing word-pronunciation alignments obtained from a ctm file; "
+                        "it represents phonetic decoding results, aligned with word boundaries obtained "
+                        "from forced alignments. Each line must be <utt-id> <word> <phone1> ... <phoneN>.")
+    parser.add_argument("silence_file", metavar = "", type = str,
+                        help = "File containing a list of silence phones.")
+    parser.add_argument("optional_silence_file", metavar = "", type = str,
+                        help = "File containing the optional silence phone.
We'll be replacing empty prons by this," + "because empty prons would cause a problem for lattice word alignment.") + parser.add_argument("non_scored_words_file", metavar = "", type = str, + help = "File containing a list of non-scored words.") + parser.add_argument("stats_file", metavar = "", type = str, + help = "Write accumulated statitistics to this file; each line represents how many times " + "a specific word-pronunciation pair appears in the phonetic decoding results (ctm_pron_file)." + "each line is ") + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.ctm_prons_file == "-": + args.ctm_prons_file_handle = sys.stdin + else: + args.ctm_prons_file_handle = open(args.ctm_prons_file) + args.non_scored_words_file_handle = open(args.non_scored_words_file) + args.silence_file_handle = open(args.silence_file) + args.optional_silence_file_handle = open(args.optional_silence_file) + if args.stats_file == "-": + args.stats_file_handle = sys.stdout + else: + args.stats_file_handle = open(args.stats_file, "w") + return args + +def ReadEntries(file_handle): + entries = set() + for line in file_handle: + entries.add(line.strip()) + return entries + +# Basically, this function generates an "info" list from a ctm_prons file. +# Each entry in the list represents the pronounciation candidate(s) of a word. +# For each non- word, the entry is a list: [utt_id, word, set(pronunciation_candidates)]. e.g: +# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] +# For each , we split the phones it aligns to into two parts: "nonsil_left", +# which includes phones before the first silphone, and "nonsil_right", which includes +# phones after the last silphone. For example, for : 'V SIL B AH SIL', +# nonsil_left is 'V' and nonsil_right is empty ''. After processing an entry +# in ctm_prons, we put it in "info" as an entry: [utt_id, word, nonsil_right] +# only if it's nonsil_right segment is not empty, which may be used when processing +# the next word. +# +# Normally, one non- word is only aligned to one pronounciation candidate. However +# when there is a preceding/following , like in the following example, we +# assume the phones aligned to should be statistically distributed +# to its neighboring words (BTW we assume there are no consecutive within an utterance.) +# Thus we append the "nonsil_left" segment of these phones to the pronounciation +# of the preceding word, if the last phone of this pronounciation is not a silence phone, +# Similarly we can add a pron candidate to the following word. 
+# +# For example, for the following part of a ctm_prons file: +# 911Mothers_2010W-0010916-0012901-1 other AH DH ER +# 911Mothers_2010W-0010916-0012901-1 K AH N SIL B +# 911Mothers_2010W-0010916-0012901-1 because IH K HH W AA Z AH +# 911Mothers_2010W-0010916-0012901-1 V SIL +# 911Mothers_2010W-0010916-0012901-1 when W EH N +# 911Mothers_2010W-0010916-0012901-1 people P IY P AH L +# 911Mothers_2010W-0010916-0012901-1 SIL +# 911Mothers_2010W-0010916-0012901-1 heard HH ER +# 911Mothers_2010W-0010916-0012901-1 D +# 911Mothers_2010W-0010916-0012901-1 that SIL DH AH T +# 911Mothers_2010W-0010916-0012901-1 my M AY +# +# The corresponding segment in the "info" list is: +# [911Mothers_2010W-0010916-0012901-1, other, set('AH DH ER', 'AH DH ER K AH N')] +# [911Mothers_2010W-0010916-0012901-1, , 'B' +# [911Mothers_2010W-0010916-0012901-1, because, set('IH K HH W AA Z AH', 'B IH K HH W AA Z AH', 'IH K HH W AA Z AH V', 'B IH K HH W AA Z AH V')] +# [911Mothers_2010W-0010916-0012901-1, when, set('W EH N')] +# [911Mothers_2010W-0010916-0012901-1, people, set('P IY P AH L')] +# [911Mothers_2010W-0010916-0012901-1, , 'D'] +# [911Mothers_2010W-0010916-0012901-1, that, set('SIL DH AH T')] +# [911Mothers_2010W-0010916-0012901-1, my, set('M AY')] +# +# Then we accumulate pronouciation stats from "info". Basically, for each occurence +# of a word, each pronounciation candidate gets equal soft counts. e.g. In the above +# example, each pron candidate of "because" gets a count of 1/4. The stats is stored +# in a dictionary (word, pron) : count. + +def GetStatsFromCtmProns(silphones, optional_silence, non_scored_words, ctm_prons_file_handle): + info = [] + for line in ctm_prons_file_handle.readlines(): + splits = line.strip().split() + utt = splits[0] + word = splits[1] + phones = splits[2:] + if phones == []: + phones = [optional_silence] + # extract the nonsil_left and nonsil_right segments, and then try to + # append nonsil_left to the pron candidates of preceding word, getting + # extended pron candidates. + # Note: the ctm_pron file may have cases like: + # KevinStone_2010U-0024782-0025580-1 [UH] EH + # KevinStone_2010U-0024782-0025580-1 fda F T + # KevinStone_2010U-0024782-0025580-1 [NOISE] IY EY + # which means non-scored-words (except oov symbol /) behaves like . + # So we apply the same merging method in these cases. + if word == '' or (word in non_scored_words and word != '' and word != ''): + nonsil_left = [] + nonsil_right = [] + for phone in phones: + if phone in silphones: + break + nonsil_left.append(phone) + + for phone in reversed(phones): + if phone in silphones: + break + nonsil_right.insert(0, phone) + + # info[-1][0] is the utt_id of the last entry + if len(nonsil_left) > 0 and len(info) > 0 and utt == info[-1][0]: + # pron_ext is a set of extended pron candidates. + pron_ext = set() + # info[-1][2] is the set of pron candidates of the last entry. + for pron in info[-1][2]: + # skip generating the extended pron candidate if + # the pron ends with a silphone. 
+ ends_with_sil = False + for sil in silphones: + if pron.endswith(sil): + ends_with_sil = True + if not ends_with_sil: + pron_ext.add(pron+" "+" ".join(nonsil_left)) + if isinstance(info[-1][2], set): + info[-1][2] = info[-1][2].union(pron_ext) + if len(nonsil_right) > 0: + info.append([utt, word, " ".join(nonsil_right)]) + else: + prons = set() + prons.add(" ".join(phones)) + # If there's a preceding /non_scored_words (which means the third field is a string rather than a set of strings), + # we append it's nonsil_right segment to the pron candidates of the current word. + if len(info) > 0 and utt == info[-1][0] and isinstance(info[-1][2], str) and (phones == [] or phones[0] not in silphones): + # info[-1][2] is the nonsil_right segment of the phones aligned to the last /non_scored_words. + prons.add(info[-1][2]+' '+" ".join(phones)) + info.append([utt, word, prons]) + stats = {} + for utt, word, prons in info: + # If the prons is not a set, the current word must be or an non_scored_word, + # where we just left the nonsil_right part as prons. + if isinstance(prons, set) and len(prons) > 0: + count = 1.0 / float(len(prons)) + for pron in prons: + phones = pron.strip().split() + # post-processing: remove all begining/trailing silence phones. + # we allow only candidates that either consist of a single silence + # phone, or the silence phones are inside non-silence phones. + if len(phones) > 1: + begin = 0 + for phone in phones: + if phone in silphones: + begin += 1 + else: + break + if begin == len(phones): + begin -= 1 + phones = phones[begin:] + if len(phones) == 1: + break + end = len(phones) + for phone in reversed(phones): + if phone in silphones: + end -= 1 + else: + break + phones = phones[:end] + phones = " ".join(phones) + stats[(word, phones)] = stats.get((word, phones), 0) + count + return stats + +def WriteStats(stats, file_handle): + for word_pron, count in stats.iteritems(): + print('{0} {1} {2}'.format(count, word_pron[0], word_pron[1]), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + silphones = ReadEntries(args.silence_file_handle) + non_scored_words = ReadEntries(args.non_scored_words_file_handle) + optional_silence = ReadEntries(args.optional_silence_file_handle) + stats = GetStatsFromCtmProns(silphones, optional_silence.pop(), non_scored_words, args.ctm_prons_file_handle) + WriteStats(stats, args.stats_file_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/apply_g2p.sh b/egs/wsj/s5/steps/dict/apply_g2p.sh new file mode 100755 index 00000000000..1f66c838010 --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_g2p.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Xiaohui Zhang +# Apache 2.0 + +# Begin configuration section. +stage=0 +encoding='utf-8' +var_counts=3 #Generate upto N variants +var_mass=0.9 #Generate so many variants to produce 90 % of the prob mass +cmd=run.pl +nj=10 #Split the task into several parallel, to speedup things +model= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "... 
where is a list of words whose pronunciation is to be generated" + echo " is a directory used as a target during training of G2P" + echo " is the directory where the output lexicon should be stored" + echo "e.g.: $0 oov_words exp/g2p exp/g2p/oov_lex" + echo "" + echo "main options (for others, see top of script file)" + echo " --nj # How many tasks should be spawn (to speedup things)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +wordlist=$1 +modeldir=$2 +output=$3 + + +mkdir -p $output/log + +model=$modeldir/g2p.model.final +[ ! -f ${model:-} ] && echo "File $model not found in the directory $modeldir." && exit 1 +#[ ! -x $wordlist ] && echo "File $wordlist not found!" && exit 1 + +cp $wordlist $output/wordlist.txt + +if ! g2p=`which g2p.py` ; then + echo "The Sequitur was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh" + exit 1 +fi + +echo "Applying the G2P model to wordlist $wordlist" + +if [ $stage -le 0 ]; then + $cmd JOBS=1:$nj $output/log/apply.JOBS.log \ + split -n l/JOBS/$nj $output/wordlist.txt \| \ + g2p.py -V $var_mass --variants-number $var_counts --encoding $encoding \ + --model $modeldir/g2p.model.final --apply - \ + \> $output/output.JOBS +fi +cat $output/output.* > $output/output + +# Remap the words from output file back to the original casing +# Conversion of some of thems might have failed, so we have to be careful +# and use the transform_map file we generated beforehand +# Also, because the sequitur output is not readily usable as lexicon (it adds +# one more column with ordering of the pron. variants) convert it into the proper lexicon form +output_lex=$output/lexicon.lex + +# Just convert it to a proper lexicon format +cut -f 1,3,4 $output/output > $output_lex + +# Some words might have been removed or skipped during the process, +# let's check it and warn the user if so... +nlex=`cut -f 1 $output_lex | sort -u | wc -l` +nwlist=`cut -f 1 $output/wordlist.txt | sort -u | wc -l` +if [ $nlex -ne $nwlist ] ; then + echo "WARNING: Unable to generate pronunciation for all words. "; + echo "WARINNG: Wordlist: $nwlist words" + echo "WARNING: Lexicon : $nlex words" + echo "WARNING:Diff example: " + diff <(cut -f 1 $output_lex | sort -u ) \ + <(cut -f 1 $output/wordlist.txt | sort -u ) || true +fi +exit 0 diff --git a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py new file mode 100755 index 00000000000..a5bdbc30d46 --- /dev/null +++ b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +import argparse +import sys + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon" + "to produce a learned lexicon.", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("in_lexicon", metavar='', type = str, + help = "Input lexicon. Each line must be .") + parser.add_argument("lexicon_edits_file", metavar='', type = str, + help = "Input lexicon edits file containing human-readable & editable" + "pronounciation info. 
The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represents the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs. See steps/dict/select_prons_bayesian.py for more details.") + parser.add_argument("out_lexicon", metavar='', type = str, + help = "Output lexicon to this file.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.in_lexicon == "-": + args.in_lexicon = sys.stdin + else: + args.in_lexicon_handle = open(args.in_lexicon) + args.lexicon_edits_file_handle = open(args.lexicon_edits_file) + + if args.out_lexicon == "-": + args.out_lexicon_handle = sys.stdout + else: + args.out_lexicon_handle = open(args.out_lexicon, "w") + + return args + +def ReadLexicon(lexicon_file_handle): + lexicon = set() + if lexicon_file_handle: + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + phones = ' '.join(splits[1:]) + lexicon.add((word, phones)) + return lexicon + +def ApplyLexiconEdits(lexicon, lexicon_edits_file_handle): + if lexicon_edits_file_handle: + for line in lexicon_edits_file_handle.readlines(): + # skip all commented lines + if line.startswith('#'): + continue + # read a word from a line like "---- MICROPHONES 200.0 ----". + if line.startswith('---'): + splits = line.strip().strip('-').strip().split() + if len(splits) != 2: + print(splits, file=sys.stderr) + raise Exception('Invalid format of line ' + line + + ' in lexicon edits file.') + word = splits[0].strip() + else: + # parse the pron and decision 'Y/N' of accepting the pron or not, + # from a line like: 'P | Y | 42.0 | M AY K R AH F OW N Z' + splits = line.split('|') + if len(splits) != 4: + raise Exception('Invalid format of line ' + line + + ' in lexicon edits file.') + pron = splits[3].strip() + if splits[1].strip() == 'Y': + lexicon.add((word, pron)) + elif splits[1].strip() == 'N': + lexicon.discard((word, pron)) + else: + raise Exception('Invalid format of line ' + line + + ' in lexicon edits file.') + return lexicon + + +def WriteLexicon(lexicon, out_lexicon_handle): + for word, pron in lexicon: + print('{0} {1}'.format(word, pron), file=out_lexicon_handle) + out_lexicon_handle.close() + +def Main(): + args = GetArgs() + lexicon = ReadLexicon(args.in_lexicon_handle) + ApplyLexiconEdits(lexicon, args.lexicon_edits_file_handle) + WriteLexicon(lexicon, args.out_lexicon_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/get_pron_stats.py b/egs/wsj/s5/steps/dict/get_pron_stats.py new file mode 100755 index 00000000000..b5202a69abb --- /dev/null +++ b/egs/wsj/s5/steps/dict/get_pron_stats.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# 2016 Vimal Manohar +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Accumulate statistics from lattice-alignment outputs for lexicon" + "learning. 
The inputs are a file containing arc level information from lattice-align-words," + "and a map which maps word-position-dependent phones to word-position-independent phones" + "(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts" + "of pronunciations", + epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" + " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" + " exp/tri3_lex_0.4_work/lats/pron_stats.txt" + "See steps/dict/learn_lexicon.sh for examples in detail.") + + parser.add_argument("arc_info_file", metavar = "", type = str, + help = "Input file containing per arc statistics; " + "each line must be ") + parser.add_argument("phone_map", metavar = "", type = str, + help = "An input phone map used to remove word boundary markers from phones;" + "generated in steps/cleanup/debug_lexicon.sh") + parser.add_argument("stats_file", metavar = "", type = str, + help = "Write accumulated statitistics to this file;" + "each line is ") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.arc_info_file == "-": + args.arc_info_file_handle = sys.stdin + else: + args.arc_info_file_handle = open(args.arc_info_file) + args.phone_map_handle = open(args.phone_map) + + if args.stats_file == "-": + args.stats_file_handle = sys.stdout + else: + args.stats_file_handle = open(args.stats_file, "w") + + return args + + +def GetStatsFromArcInfo(arc_info_file_handle, phone_map_handle): + prons = defaultdict(set) + # need to map the phones to remove word boundary markers. + phone_map = {} + stats_unmapped = {} + stats = {} + for line in phone_map_handle.readlines(): + splits = line.strip().split() + phone_map[splits[0]] = splits[1] + + for line in arc_info_file_handle.readlines(): + splits = line.strip().split() + if (len(splits) == 0): + continue + if (len(splits) < 6): + raise Exception('Invalid format of line ' + line + + ' in arc_info_file') + word = splits[4] + count = float(splits[3]) + phones = " ".join(splits[5:]) + prons[word].add(phones) + stats_unmapped[(word, phones)] = stats_unmapped.get((word, phones), 0) + count + + for word_pron, count in stats_unmapped.iteritems(): + phones_unmapped = word_pron[1].split() + phones = [phone_map[phone] for phone in phones_unmapped] + stats[(word_pron[0], " ".join(phones))] = count + return stats + +def WriteStats(stats, file_handle): + for word_pron, count in stats.iteritems(): + print('{2} {0} {1}'.format(word_pron[0], word_pron[1], count), + file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + stats = GetStatsFromArcInfo(args.arc_info_file_handle, args.phone_map_handle) + WriteStats(stats, args.stats_file_handle) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py new file mode 100755 index 00000000000..1f2863424f3 --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/prune_pron_candidates.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. 
Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top r * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." + "r is a user-specified constant, like 2.", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("--r", type = float, default = "2.0", + help = "a user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word.") + parser.add_argument("pron_stats", metavar = "", type = str, + help = "File containing soft-counts of all pronounciation candidates; " + "each line must be ") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "Reference lexicon file, where we obtain # pron variants for" + "each word, based on which we prune the pron candidates." + "Each line must be ") + parser.add_argument("pruned_prons", metavar = "", type = str, + help = "An output file in lexicon format, which contains prons we want to" + "prune off from the pron_stats file.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.pron_stats_handle = open(args.pron_stats) + args.ref_lexicon_handle = open(args.ref_lexicon) + if args.pruned_prons == "-": + args.pruned_prons_handle = sys.stdout + else: + args.pruned_prons_handle = open(args.pruned_prons, "w") + return args + +def ReadStats(pron_stats_handle): + stats = defaultdict(list) + for line in pron_stats_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in stats file.') + count = float(splits[0]) + word = splits[1] + phones = ' '.join(splits[2:]) + stats[word].append((phones, count)) + + for word, entry in stats.iteritems(): + entry.sort(key=lambda x: x[1]) + return stats + +def ReadLexicon(ref_lexicon_handle): + ref_lexicon = defaultdict(set) + for line in ref_lexicon_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + phones = ' '.join(splits[1:]) + ref_lexicon[word].add(phones) + return ref_lexicon + +def PruneProns(args, stats, ref_lexicon): + # Compute the average # pron variants counts per word in the reference lexicon. 
+ num_words_ref = 0 + num_prons_ref = 0 + for word, prons in ref_lexicon.iteritems(): + num_words_ref += 1 + num_prons_ref += len(prons) + avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) + + for word, entry in stats.iteritems(): + if word in ref_lexicon: + variants_counts = args.r * len(ref_lexicon[word]) + else: + variants_counts = args.r * avg_variants_counts_ref + num_variants = 0 + while num_variants < variants_counts: + try: + pron, prob = entry.pop() + if word not in ref_lexicon or pron not in ref_lexicon[word]: + num_variants += 1 + except IndexError: + break + + for word, entry in stats.iteritems(): + for pron, prob in entry: + if word not in ref_lexicon or pron not in ref_lexicon[word]: + print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) + +def Main(): + args = GetArgs() + ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + stats = ReadStats(args.pron_stats_handle) + PruneProns(args, stats, ref_lexicon) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/learn_lexicon.sh b/egs/wsj/s5/steps/dict/learn_lexicon.sh new file mode 100755 index 00000000000..7f32428c059 --- /dev/null +++ b/egs/wsj/s5/steps/dict/learn_lexicon.sh @@ -0,0 +1,410 @@ +#! /bin/bash + +# Copyright 2016 Xiaohui Zhang +# 2016 Vimal Manohar +# Apache 2.0 + +# This script demonstrate how to expand a existing lexicon using a combination +# of acoustic evidence and G2P to learn a lexicon that covers words in a target +# vocab, and agrees sufficiently with the acoustics. The basic idea is to +# run phonetic decoding on acoustic training data using an existing +# acoustice model (possibly re-trained using a G2P-expanded lexicon) to get +# alternative pronunciations for words in training data. Then we combine three +# exclusive sources of pronunciations: the reference lexicon (supposedly +# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run +# lattice alignment on the same data, to collect acoustic evidence (soft +# counts) of all pronunciations. Based on these statistics, and +# user-specified prior-counts (parameterized by prior mean and prior-counts-tot, +# assuming the prior follows a Dirichlet distribution), we then use a Bayesian +# framework to compute posteriors of all pronunciations for each word, +# and then select best pronunciations for each word. The output is a final learned lexicon +# whose vocab matches the user-specified target-vocab, and two intermediate resultis: +# an edits file which records the recommended changes to all in-ref-vocab words' +# prons, and a half-learned lexicon where all in-ref-vocab words' prons were untouched +# (on top of which we apply the edits file to produce the final learned lexicon). +# The user can always modify the edits file manually and then re-apply it on the +# half-learned lexicon using steps/dict/apply_lexicon_edits to produce the final +# learned lexicon. See the last stage in this script for details. + + +stage=0 +# Begin configuration section. +cmd=run.pl +nj= +stage=6 +oov_symbol= +lexicon_g2p= +min_prob=0.3 +variants_prob_mass=0.7 +variants_prob_mass_ref=0.9 +prior_counts_tot=15 +prior_mean="0.7,0.2,0.1" +num_gauss= +num_leaves= +retrain_src_mdl=true +cleanup=true +# End configuration section. + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 7 ]; then + echo "Usage: $0 [options] \\" + echo " ." 
+ echo " This script does lexicon expansion using a combination of acoustic" + echo " evidence and G2P to produce a lexicon that covers words of a target vocab:" + echo "" + echo "Arguments:" + echo " the dir which contains the reference lexicon (most probably hand-derived)" + echo " we want to expand/improve, and nonsilence_phones.txt,.etc which we need " + echo " for building new dict dirs." + echo " the vocabulary we want the final learned lexicon to cover (one word per line)." + echo " acoustic training data we use to get alternative" + echo " pronunciations and collet acoustic evidence." + echo " The dir containing an SAT-GMM acoustic model (we optionaly we re-train it" + echo " using G2P expanded lexicon) to do phonetic decoding (to get alternative" + echo " pronunciations) and lattice-alignment (to collect acoustic evidence for" + echo " evaluating all prounciations)" + echo " the reference lang dir which we use to get non-scored-words" + echo " like for building new dict dirs" + echo " the dict dir where we put the final learned lexicon, whose vocab" + echo " matches ." + echo "" + echo "Note: and the vocab of don't have to match. For words" + echo " who are in but not seen in , their pronunciations" + echo " will be given by G2P at the end." + echo "" + echo "e.g. $0 data/local/dict data/local/lm/librispeech-vocab.txt data/train \\" + echo " exp/tri3 data/lang data/local/dict_learned" + echo "Options:" + echo " --stage # stage to run from, to enable resuming from partially" + echo " # completed run (default: 0)" + echo " --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl)" + echo " --nj # number of parallel jobs" + echo " --oov-symbol '$oov_symbol' # oov symbol, like ." + echo " --g2p-pron-candidates # A lexicon file containing g2p generated pronunciations, for words in acoustic training " + echo " # data / target vocabulary. It's optional." + echo " --min-prob # The cut-off parameter used to select pronunciation candidates from phonetic" + echo " # decoding. We remove pronunciations with probabilities less than this value" + echo " # after normalizing the probs s.t. the max-prob is 1.0 for each word." + echo " --prior-mean # Mean of priors (summing up to 1) assigned to three exclusive pronunciation" + echo " # source: reference lexicon, g2p, and phonetic decoding (used in the Bayesian" + echo " # pronunciation selection procedure). We recommend setting a larger prior" + echo " # mean for the reference lexicon, e.g. '0.6,0.2,0.2'." + echo " --prior-counts-tot # Total amount of prior counts we add to all pronunciation candidates of" + echo " # each word. By timing it with the prior mean of a source, and then dividing" + echo " # by the number of candidates (for a word) from this source, we get the" + echo " # prior counts we actually add to each candidate." + echo " --variants-prob-mass # In the Bayesian pronunciation selection procedure, for each word, we" + echo " # choose candidates (from all three sources) with highest posteriors" + echo " # until the total prob mass hit this amount." + echo " # It's used in a similar fashion when we apply G2P." + echo " --variants-prob-mass-ref # In the Bayesian pronunciation selection procedure, for each word," + echo " # after the total prob mass of selected candidates hit variants-prob-mass," + echo " # we continue to pick up reference candidates with highest posteriors" + echo " # until the total prob mass hit this amount (must >= variants-prob-mass)." 
+ echo " --num-gauss # number of gaussians for the re-trained SAT model (on top of )." + echo " --num-leaves # number of leaves for the re-trained SAT model (on top of )." + echo " --retrain-src-mdl # true if you want to re-train the src_mdl before phone decoding (default false)." + exit 1 +fi + +echo "$0 $@" # Print the command line for logging + +ref_dict=$1 +target_vocab=$2 +data=$3 +src_mdl_dir=$4 +ref_lang=$5 +dest_dict=$6 +dir=$7 # Most intermediate outputs will be put here. + +mkdir -p $dir +if [ $stage -le 0 ]; then + echo "$0: Some preparatory work." + # Get the word counts of training data. + awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \ + $data/text | sort > $dir/train_counts.txt + + # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab. + steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt > $dir/non_scored_entries + + # Remove non-scored-words from the reference lexicon. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt | tr -s '\t' ' ' > $dir/ref_lexicon.txt + + cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $target_vocab | sort | uniq > $dir/target_vocab.txt + + # From the reference lexicon, we estimate the target_num_prons_per_word as, + # ceiling(avg. # prons per word in the reference lexicon). This'll be used as + # the upper bound of # pron variants per word when we apply G2P or select prons to + # construct the learned lexicon in later stages. + python -c 'import sys; import math; print int(math.ceil(float(sys.argv[1])/float(sys.argv[2])))' \ + `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ + > $dir/target_num_prons_per_word || exit 1; + + if [ -z $lexicon_g2p ]; then + # create an empty list of g2p generated prons, if it's not given. + touch $dir/lexicon_g2p.txt + else + cp $lexicon_g2p $dir/lexicon_g2p.txt 2>/dev/null + fi +fi + +if [ $stage -le 1 ] && $retrain_src_mdl; then + echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then" + echo " ... re-train the source acoustic model for phonetic decoding. " + mkdir -p $dir/dict_expanded_target_vocab + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_target_vocab 2>/dev/null + rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in the target vocab. + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt + + # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which + # cannot be found in lexicon_g2p.txt, we simply ignore them. 
+ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \ + $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt + + cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \ + cat $dir/non_scored_entries - | + sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \ + $oov_symbol $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1; + + # Align the acoustic training data using the given src_mdl_dir. + alidir=${src_mdl_dir}_ali_$(basename $data) + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1; + + # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained + # this model will be used for phonetic decoding and lattice alignment later on. + if [ -z $num_leaves ] || [ -z $num_gauss ] ; then + echo "num_leaves and num_gauss need to be specified." && exit 1; + fi + steps/train_sat.sh --cmd "$train_cmd" $num_leaves $num_gauss \ + $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Expand the reference lexicon to cover all words seen in," + echo " ... acoustic training data, and prepare corresponding dict and lang directories." + echo " ... This is needed when generate pron candidates from phonetic decoding." + mkdir -p $dir/dict_expanded_train + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_train 2>/dev/null + rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in training data. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ + $dir/train_counts.txt | sort > $dir/oov_train.txt + + awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate + + echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" + cat $dir/train_oov_rate + + # Assign pronunciations from lexicon_g2p to oov_train. For words which + # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton + # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | cut -f2- -d' '` + # For oov words in training data for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ). 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ + $dir/oov_train.txt | awk -v op=$oov_pron '{print $0" "op}' > $dir/oov_train_no_pron.txt + + cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat - $dir/non_scored_entries | \ + sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1; + + utils/prepare_lang.sh $dir/dict_expanded_train $oov_symbol \ + $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.." + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/cleanup/debug_lexicon.sh --nj $nj --cmd "$decode_cmd" $data $dir/lang_expanded_train \ + $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1; + + # We prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", + # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt > $dir/phonetic_decoding/filter_lexicon.txt + + $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ + --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ + $dir/phonetic_decoding/prons.txt $dir/lexicon_phonetic_decoding_with_eps.txt + cat $dir/lexicon_phonetic_decoding_with_eps.txt | grep -vP "|||\[.*\]" | \ + sort | uniq > $dir/lexicon_phonetic_decoding.txt || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one" + echo " ... lexicon, and run lattice alignment using this lexicon on acoustic training data" + echo " ... to collect acoustic evidence." + # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon. + mkdir -p $dir/dict_combined_iter1 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter1/ 2>/dev/null + rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_phonetic_decoding.txt $dir/lexicon_g2p.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter1/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter1 $oov_symbol \ + $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1; + + # Generate lattices for the acoustic training data with the combined lexicon. + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + $data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; + + # Get arc level information from the lattice. 
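The pipeline below (lattice-align-words, then lattice-arc-post, then int2sym.pl) produces per-arc lines in which, as read by the steps/dict/get_pron_stats.py added later in this patch, the 4th field is the arc posterior, the 5th is the word, and the remaining fields are its (still word-position-dependent) phones. A minimal sketch of the accumulation that script then performs, omitting the phone_map.txt step that strips the word-position markers:

from collections import defaultdict

def accumulate_pron_stats(arc_info_lines):
    """Sum lattice posteriors per (word, pronunciation) pair."""
    stats = defaultdict(float)
    for line in arc_info_lines:
        splits = line.split()
        if len(splits) < 6:
            continue                     # skip malformed or empty lines
        posterior = float(splits[3])     # soft count contributed by this arc
        word = splits[4]
        pron = " ".join(splits[5:])      # phones, still position-dependent here
        stats[(word, pron)] += posterior
    return stats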
+ $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \ + lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \ + $dir/lats_iter1/final.mdl \ + "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \ + lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \ + utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \ + utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \ + $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1; + + # Get soft counts of all pronunciations from arc level information. + cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \ + $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment." + mkdir -p $dir/dict_combined_iter2 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter2/ 2>/dev/null + rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null + + # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment. + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py $dir/lats_iter1/pron_stats.txt $dir/ref_lexicon.txt $dir/pruned_prons.txt + + awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_phonetic_decoding.txt \ + > $dir/lexicon_phonetic_decoding_pruned.txt + + awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_g2p.txt \ + > $dir/lexicon_g2p_pruned.txt \ + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter2/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter2 $oov_symbol \ + $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1; + + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + $data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1; + + # Get arc level information from the lattice. + $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \ + lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \ + $dir/lats_iter2/final.mdl \ + "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \ + lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \ + utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \ + utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \ + $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1; + + # Get soft counts of all pronunciations from arc level information. + cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \ + $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1; +fi + +if [ $stage -le 6 ]; then + echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment." + # Given the acoustic evidence (soft-counts), we use a Bayesian framework to select pronunciations + # from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding. 
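+  # Illustrative numbers only: with the defaults above (prior_mean="0.7,0.2,0.1",
+  # prior_counts_tot=15), a word with 2 reference prons, 1 G2P pron and 3
+  # phonetic-decoding prons gets per-candidate prior counts of about
+  # 0.7*15/2 = 5.25, 0.2*15/1 = 3.0 and 0.1*15/3 = 0.5 respectively; these are
+  # combined with the soft counts collected above when posteriors are computed,
+  # and candidates are then kept greedily until variants_prob_mass (and, for
+  # reference prons, variants_prob_mass_ref) is reached.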
+ # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt + # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt. + # Among them, for words without acoustic evidence, we just ignore them, even if pron candidates from G2P were provided). + # For words in the ref. vocab, we instead output a human readable & editable "edits" file called + # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a + # summary is printed into the log file. + + variants_counts=`cat $dir/target_num_prons_per_word` || exit 1; + $cmd $dir/lats_iter2/log/select_prons_bayesian.log \ + steps/dict/select_prons_bayesian.py --prior-mean=$prior_mean --prior-counts-tot=$prior_counts_tot \ + --variants-counts=$variants_counts --variants-prob-mass=$variants_prob_mass --variants-prob-mass-ref=$variants_prob_mass_ref \ + $ref_dict/silence_phones.txt $dir/lats_iter2/pron_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_phonetic_decoding_pruned.txt \ + $dir/lats_iter2/pron_posteriors.temp $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/lats_iter2/ref_lexicon_edits.txt + + # We reformat the pron_posterior file and add some comments. + paste <(cat $dir/lats_iter2/pron_posteriors.temp | cut -d' ' -f1-3 | column -t) \ + <(cat $dir/lats_iter2/pron_posteriors.temp | cut -d' ' -f4-) | sort -nr -k1,3 | \ + cat <( echo ';; ') - \ + > $dir/lats_iter2/pron_posteriors.txt + rm $dir/pron_posteriors.temp 2>/dev/null + + # Remove some stuff that takes up space and is unlikely to be useful later on. + if $cleanup; then + rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: Expand the learned lexicon further to cover words in target vocab that are." + echo " ... not seen in acoustic training data." + mkdir -p $dest_dict + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dest_dict 2>/dev/null + rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null + # Get the list of oov (w.r.t. ref vocab) without acoustic evidence, which are in the + # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. + cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_no_acoustics.txt + + # We concatenate three lexicons togethers: G2P lexicon for oov words without acoustics, + # learned lexicon for oov words with acoustics, and the original reference lexicon (for + # this part, later one we'll apply recommended changes using steps/dict/apply_lexicon_edits.py + cat $dir/g2p_prons_for_oov_no_acoustics.txt $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt \ + $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil + + cat $dir/non_scored_entries $dest_dict/lexicon.nosil | sort | uniq >$dest_dict/lexicon0.txt +fi + +if [ $stage -le 8 ]; then + echo "$0: Apply the ref_lexicon_edits file to the reference lexicon." + echo " ... The user can inspect/modify the edits file and then re-run:" + echo " ... 
steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \\" + echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." + cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null + steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ + sort | uniq > $dest_dict/lexicon.txt +fi diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py new file mode 100755 index 00000000000..2a87d172602 --- /dev/null +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python + +# Copyright 2016 Vimal Manohar +# 2016 Xiaohui Zhang +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse +import sys + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phone level decoding) " + "into a lexicon for lexicon learning. We prune the pronunciations " + "based on a provided stats file, and optionally filter out entries which are present " + "in a filter lexicon.", + epilog = "e.g. steps/dict/prons_to_lexicon.py --min-prob=0.4 \\" + "--filter-lexicon=exp/tri3_lex_0.4_work/phone_decode/filter_lexicon.txt \\" + "exp/tri3_lex_0.4_work/phone_decode/prons.txt \\" + "exp/tri3_lex_0.4_work/lexicon_phone_decoding.txt" + "See steps/dict/learn_lexicon.sh for examples in detail.") + + parser.add_argument("--set-sum-to-one", type = str, default = False, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize lexicon such that the sum of " + "probabilities is 1.") + parser.add_argument("--set-max-to-one", type = str, default = True, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize lexicon such that the max " + "probability is 1.") + parser.add_argument("--min-prob", type = float, default = 0.1, + help = "Remove pronunciation with probabilities less " + "than this value after normalization.") + parser.add_argument("--filter-lexicon", metavar='', type = str, default = '', + help = "Exclude entries in this filter lexicon from the output lexicon." + "each line must be ") + parser.add_argument("stats_file", metavar='', type = str, + help = "Input file containing pronunciation statistics, representing how many times " + "each word-pronunciation appear in the phonetic decoding results." 
+ "each line must be ") + parser.add_argument("out_lexicon", metavar='', type = str, + help = "Output lexicon.") + + print (' '.join(sys.argv), file = sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.stats_file == "-": + args.stats_file_handle = sys.stdin + else: + args.stats_file_handle = open(args.stats_file) + + if args.filter_lexicon is not '': + if args.filter_lexicon == "-": + args.filter_lexicon_handle = sys.stdout + else: + args.filter_lexicon_handle = open(args.filter_lexicon) + + if args.out_lexicon == "-": + args.out_lexicon_handle = sys.stdout + else: + args.out_lexicon_handle = open(args.out_lexicon, "w") + + if args.set_max_to_one == args.set_sum_to_one: + raise Exception("Cannot have both " + "set-max-to-one and set-sum-to-one as true or false.") + + return args + +def ReadStats(args): + lexicon = {} + word_count = {} + for line in args.stats_file_handle: + splits = line.strip().split() + if len(splits) < 3: + continue + + word = splits[1] + count = float(splits[0]) + phones = ' '.join(splits[2:]) + + lexicon[(word, phones)] = lexicon.get((word, phones), 0) + count + word_count[word] = word_count.get(word, 0) + count + + return [lexicon, word_count] + +def ReadLexicon(lexicon_file_handle): + lexicon = set() + if lexicon_file_handle: + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + phones = ' '.join(splits[1:]) + lexicon.add((word, phones)) + return lexicon + +def ConvertWordCountsToProbs(args, lexicon, word_count): + word_probs = {} + for entry, count in lexicon.iteritems(): + word = entry[0] + phones = entry[1] + prob = float(count) / float(word_count[word]) + if word in word_probs: + word_probs[word].append((phones, prob)) + else: + word_probs[word] = [(phones, prob)] + + return word_probs + +def ConvertWordProbsToLexicon(word_probs): + lexicon = {} + for word, entry in word_probs.iteritems(): + for x in entry: + lexicon[(word, x[0])] = lexicon.get((word,x[0]), 0) + x[1] + return lexicon + +def NormalizeLexicon(lexicon, set_max_to_one = True, + set_sum_to_one = False, min_prob = 0): + word_probs = {} + for entry, prob in lexicon.iteritems(): + t = word_probs.get(entry[0], (0,0)) + word_probs[entry[0]] = (t[0] + prob, max(t[1], prob)) + + for entry, prob in lexicon.iteritems(): + if set_max_to_one: + prob = prob / word_probs[entry[0]][1] + elif set_sum_to_one: + prob = prob / word_probs[entry[0]][0] + if prob < min_prob: + prob = 0 + lexicon[entry] = prob + +def WriteLexicon(args, lexicon, filter_lexicon): + words = set() + num_removed = 0 + num_filtered = 0 + for entry, prob in lexicon.iteritems(): + if prob == 0: + num_removed += 1 + continue + if entry in filter_lexicon: + num_filtered += 1 + continue + words.add(entry[0]) + print("{0} {1}".format(entry[0], entry[1]), + file = args.out_lexicon_handle) + print ("Before pruning, the total num. pronunciations is: {}".format(len(lexicon)), file=sys.stderr) + print ("Removed {0} pronunciations by setting min_prob {1}".format(num_removed, args.min_prob), file=sys.stderr) + print ("Filtered out {} pronunciations in the filter lexicon.".format(num_filtered), file=sys.stderr) + num_prons_from_phone_decoding = len(lexicon) - num_removed - num_filtered + print ("Num. pronunciations in the output lexicon, which solely come from phone decoding" + "is {0}. num. 
words is {1}".format(num_prons_from_phone_decoding, len(words)), file=sys.stderr) + +def Main(): + args = GetArgs() + + [lexicon, word_count] = ReadStats(args) + + word_probs = ConvertWordCountsToProbs(args, lexicon, word_count) + + lexicon = ConvertWordProbsToLexicon(word_probs) + filter_lexicon = ReadLexicon(args.filter_lexicon_handle) + NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, + set_sum_to_one = args.set_sum_to_one, + min_prob = args.min_prob) + WriteLexicon(args, lexicon, filter_lexicon) + args.out_lexicon_handle.close() + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py new file mode 100755 index 00000000000..affc5b17705 --- /dev/null +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top r * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." + "r is a user-specified constant, like 2.", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("--r", type = float, default = "2.0", + help = "a user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word.") + parser.add_argument("pron_stats", metavar = "", type = str, + help = "File containing soft-counts of all pronounciation candidates; " + "each line must be ") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "Reference lexicon file, where we obtain # pron variants for" + "each word, based on which we prune the pron candidates.") + parser.add_argument("pruned_prons", metavar = "", type = str, + help = "A file in lexicon format, which contains prons we want to" + "prune away from the pron_stats file.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.pron_stats_handle = open(args.pron_stats) + args.ref_lexicon_handle = open(args.ref_lexicon) + if args.pruned_prons == "-": + args.pruned_prons_handle = sys.stdout + else: + args.pruned_prons_handle = open(args.pruned_prons, "w") + return args + +def ReadStats(pron_stats_handle): + stats = defaultdict(list) + for line in pron_stats_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in stats file.') + count = float(splits[0]) + word = splits[1] + phones = ' '.join(splits[2:]) + stats[word].append((phones, count)) + + for word, entry in stats.iteritems(): + entry.sort(key=lambda x: x[1]) + return stats + +def ReadLexicon(ref_lexicon_handle): + ref_lexicon = defaultdict(set) + for line in ref_lexicon_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + try: + phones = ' 
'.join(splits[1:]) + except ValueError: + phones = ' '.join(splits[1:]) + ref_lexicon[word].add(phones) + return ref_lexicon + +def PruneProns(args, stats, ref_lexicon): + # Compute the average number of pron variants per word in the reference lexicon. + num_words_ref = 0 + num_prons_ref = 0 + for word, prons in ref_lexicon.iteritems(): + num_words_ref += 1 + num_prons_ref += len(prons) + avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) + + for word, entry in stats.iteritems(): + if word in ref_lexicon: + variants_counts = args.r * len(ref_lexicon[word]) + else: + variants_counts = args.r * avg_variants_counts_ref + num_variants = 0 + while num_variants < variants_counts: + try: + pron, prob = entry.pop() + if word not in ref_lexicon or pron not in ref_lexicon[word]: + num_variants += 1 + except IndexError: + break + + for word, entry in stats.iteritems(): + for pron, prob in entry: + if word not in ref_lexicon or pron not in ref_lexicon[word]: + print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) + +def Main(): + args = GetArgs() + ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + stats = ReadStats(args.pron_stats_handle) + PruneProns(args, stats, ref_lexicon) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/select_prons_bayesian.py b/egs/wsj/s5/steps/dict/select_prons_bayesian.py new file mode 100755 index 00000000000..e728a4af0b8 --- /dev/null +++ b/egs/wsj/s5/steps/dict/select_prons_bayesian.py @@ -0,0 +1,445 @@ +#!/usr/bin/env python + +# Copyright 2016 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser(description = "Use a Bayesian framework to select " + "pronunciation candidates from three sources: reference lexicon" + ", G2P lexicon and phonetic-decoding lexicon. The inputs are a word-stats file, " + "a pron-stats file, and three source lexicons (ref/G2P/phonetic-decoding). " + "We assume the pronunciations for each word follow a Categorical distribution " + "with Dirichlet priors. Thus, with user-specified prior counts (parameterized by " + "prior-mean and prior-counts-tot) and observed counts from the pron-stats file, " + "we can compute the posterior for each pron, and select candidates with highest " + "posteriors, until we hit user-specified variants-prob-mass/counts thresholds. " + "The outputs are: a file specifying posteriors of all candidate prons (pron_posteriors), " + "a learned lexicon for words out of the ref. vocab (learned_lexicon_oov), " + "and a lexicon_edits file containing suggested modifications of prons, for " + "words within the ref. vocab (ref_lexicon_edits).", + epilog = "See steps/dict/learn_lexicon.sh for example.") + parser.add_argument("--prior-mean", type = str, default = "0,0,0", + help = "Mean of priors (summing up to 1) assigned to three exclusive " + "pronunciation sources: reference lexicon, g2p, and phonetic decoding. We " + "recommend setting a larger prior mean for the reference lexicon, e.g. '0.6,0.2,0.2'") + parser.add_argument("--prior-counts-tot", type = float, default = 15.0, + help = "Total amount of prior counts we add to all pronunciation candidates of " + "each word. 
By timing it with the prior mean of a source, and then dividing" + "by the number of candidates (for a word) from this source, we get the" + "prior counts we actually add to each candidate.") + parser.add_argument("--variants-prob-mass", type = float, default = 0.7, + help = "For each word, we pick up candidates (from all three sources)" + "with highest posteriors until the total prob mass hit this amount.") + parser.add_argument("--variants-prob-mass-ref", type = float, default = 0.9, + help = "For each word, after the total prob mass of selected candidates " + "hit variants-prob-mass, we continue to pick up reference candidates" + "with highest posteriors until the total prob mass hit this amount (must >= variants-prob-mass).") + parser.add_argument("--variants-counts", type = int, default = 1, + help = "Generate upto this many variants of prons for each word out" + "of the ref. lexicon.") + parser.add_argument("silence_file", metavar = "", type = str, + help = "File containing a list of silence phones.") + parser.add_argument("pron_stats_file", metavar = "", type = str, + help = "File containing pronunciation statistics from lattice alignment; " + "each line must be .") + parser.add_argument("word_counts_file", metavar = "", type = str, + help = "File containing word counts in acoustic training data; " + "each line must be .") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "The reference lexicon (most probably hand-derived)." + "Each line must be ") + parser.add_argument("g2p_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from G2P results." + "Each line must be ") + parser.add_argument("phonetic_decoding_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from phonetic decoding results." + "Each line must be ") + parser.add_argument("pron_posteriors", metavar = "", type = str, + help = "Output file containing posteriors of all candidate prons for each word," + "based on which we select prons to construct the learned lexicon." + "each line is ") + parser.add_argument("learned_lexicon_oov", metavar = "", type = str, + help = "Output file which is the learned lexicon for words out of the ref. vocab.") + parser.add_argument("ref_lexicon_edits", metavar = "", type = str, + help = "Output file containing human-readable & editable pronounciation info (and the" + "accept/reject decision made by our algorithm) for those words in ref. vocab," + "to which any change has been recommended. The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represents the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs. 
" + "See the function WriteEditsAndSummary for more details.") + + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + args.silence_file_handle = open(args.silence_file) + if args.pron_stats_file == "-": + args.pron_stats_file_handle = sys.stdin + else: + args.pron_stats_file_handle = open(args.pron_stats_file) + args.word_counts_file_handle = open(args.word_counts_file) + args.ref_lexicon_handle = open(args.ref_lexicon) + args.g2p_lexicon_handle = open(args.g2p_lexicon) + args.phonetic_decoding_lexicon_handle = open(args.phonetic_decoding_lexicon) + args.pron_posteriors_handle = open(args.pron_posteriors, "w") + args.learned_lexicon_oov_handle = open(args.learned_lexicon_oov, "w") + args.ref_lexicon_edits_handle = open(args.ref_lexicon_edits, "w") + + prior_mean = args.prior_mean.strip().split(',') + if len(prior_mean) is not 3: + raise Exception('Invalid Dirichlet prior mean ', args.prior_mean) + for i in range(0,3): + if float(prior_mean[i]) <= 0 or float(prior_mean[i]) >= 1: + raise Exception('Dirichlet prior mean', prior_mean[i], 'is invalid, it must be between 0 and 1.') + args.prior_mean = [float(prior_mean[0]), float(prior_mean[1]), float(prior_mean[2])] + + return args + +def ReadPronStats(pron_stats_file_handle): + stats = {} + for line in pron_stats_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in stats file.') + count = float(splits[0]) + word = splits[1] + phones = ' '.join(splits[2:]) + stats[(word, phones)] = count + return stats + +def ReadWordCounts(word_counts_file_handle): + counts = {} + for line in word_counts_file_handle.readlines(): + splits = line.strip().split() + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in counts file.') + word = splits[0] + count = int(splits[1]) + counts[word] = count + return counts + +def ReadLexicon(args, lexicon_file_handle, counts): + # we're skipping any word not in counts (not seen in training data), + # cause we're only learning prons for words who have acoustic examples. 
+ lexicon = defaultdict(set) + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + if word not in counts: + continue + phones = ' '.join(splits[1:]) + lexicon[word].add(phones) + return lexicon + +def FilterPhoneticDecodingLexicon(args, phonetic_decoding_lexicon, stats): + # We want to remove all candidates which contains silence phones + silphones = set() + for line in args.silence_file_handle: + silphones.add(line.strip()) + rejected_candidates = set() + for word, prons in phonetic_decoding_lexicon.iteritems(): + for pron in prons: + for phone in pron.split(): + if phone in silphones: + if (word, pron) in stats: + count = stats[(word, pron)] + del stats[(word, pron)] + else: + count = 0 + rejected_candidates.add((word, pron)) + print('WARNING: removing the candidate pronunciation from phonetic-decoding: {0}: ' + '"{1}" whose soft-count from lattice-alignment is {2}, cause it contains at' + ' least one silence phone.'.format(word, pron, count), file=sys.stderr) + break + for word, pron in rejected_candidates: + phonetic_decoding_lexicon[word].remove(pron) + return phonetic_decoding_lexicon, stats + +def ComputePriorCounts(args, counts, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon): + prior_counts = defaultdict(list) + # In case one source is absent for a word, we set zero prior to this source, + # and then re-normalize the prior mean parameters s.t. they sum up to one. + for word in counts: + prior_mean = [args.prior_mean[0], args.prior_mean[1], args.prior_mean[2]] + if word not in ref_lexicon: + prior_mean[0] = 0 + if word not in g2p_lexicon: + prior_mean[1] = 0 + if word not in phonetic_decoding_lexicon: + prior_mean[2] = 0 + prior_mean_sum = sum(prior_mean) + try: + prior_mean = [t / prior_mean_sum for t in prior_mean] + except ZeroDivisionError: + print('WARNING: word {} appears in train_counts but not in any lexicon.'.format(word), file=sys.stderr) + prior_counts[word] = [t * args.prior_counts_tot for t in prior_mean] + return prior_counts + +def ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon, prior_counts): + posteriors = defaultdict(list) # This dict stores a list of (pronunciation, posterior) + # pairs for each word, where the posteriors are normalized soft counts. Before normalization, + # The soft-counts were augmented by a user-specified prior count, according the source + # (ref/G2P/phonetic-decoding) of this pronunciation. 
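+ # (Illustrative example: with the recommended prior-mean '0.6,0.2,0.2' and the default prior-counts-tot 15.0, + # prior_counts[word] = [9.0, 3.0, 3.0]; a word with two ref prons then gets 9.0/2 = 4.5 prior counts added to + # each ref pron's observed soft-count before normalization.)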
+ + for word, prons in ref_lexicon.iteritems(): + for pron in prons: + # c is the augmented soft count (observed count + prior count) + c = prior_counts[word][0] / len(ref_lexicon[word]) + stats.get((word, pron), 0) + posteriors[word].append((pron, c)) + + for word, prons in g2p_lexicon.iteritems(): + for pron in prons: + c = prior_counts[word][1] / len(g2p_lexicon[word]) + stats.get((word, pron), 0) + posteriors[word].append((pron, c)) + + for word, prons in phonetic_decoding_lexicon.iteritems(): + for pron in prons: + c = prior_counts[word][2] / len(phonetic_decoding_lexicon[word]) + stats.get((word, pron), 0) + posteriors[word].append((pron, c)) + + num_prons_from_ref = sum(len(ref_lexicon[i]) for i in ref_lexicon) + num_prons_from_g2p = sum(len(g2p_lexicon[i]) for i in g2p_lexicon) + num_prons_from_phonetic_decoding = sum(len(phonetic_decoding_lexicon[i]) for i in phonetic_decoding_lexicon) + print ("---------------------------------------------------------------------------------------------------", file=sys.stderr) + print ('Total num. words is {}:'.format(len(posteriors)), file=sys.stderr) + print ('{0} candidate prons came from the reference lexicon; {1} came from G2P;{2} came from' + 'phonetic_decoding'.format(num_prons_from_ref, num_prons_from_g2p, num_prons_from_phonetic_decoding), file=sys.stderr) + print ("---------------------------------------------------------------------------------------------------", file=sys.stderr) + + # Normalize the augmented soft counts to get posteriors. + count_sum = defaultdict(float) # This dict stores the pronunciation which has + # the sum of augmented soft counts for each word. + + for word in posteriors: + # each entry is a pair: (prounciation, count) + count_sum[word] = sum([entry[1] for entry in posteriors[word]]) + + for word, entry in posteriors.iteritems(): + new_entry = [] + for pron, count in entry: + post = count / count_sum[word] + new_entry.append((pron, post)) + source = 'R' + if word in g2p_lexicon and pron in g2p_lexicon[word]: + source = 'G' + elif word in phonetic_decoding_lexicon and pron in phonetic_decoding_lexicon[word]: + source = 'P' + print(word, source, "%3.2f" % post, pron, file=args.pron_posteriors_handle) + del entry[:] + entry.extend(sorted(new_entry, key=lambda new_entry: new_entry[1])) + return posteriors + +def SelectPronsBayesian(args, counts, posteriors, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon): + reference_selected = 0 + g2p_selected = 0 + phonetic_decoding_selected = 0 + learned_lexicon = defaultdict(set) + + for word, entry in posteriors.iteritems(): + num_variants = 0 + post_tot = 0.0 + variants_counts = args.variants_counts + variants_prob_mass = args.variants_prob_mass + if word in ref_lexicon: + # the variants count of the current word's prons in the ref lexicon. + variants_counts_ref = len(ref_lexicon[word]) + # For words who don't appear in acoustic training data at all, we simply accept all ref prons. + # For words in ref. vocab, we set the max num. variants + if counts.get(word, 0) > 0: + variants_counts = math.ceil(1.5 * variants_counts_ref) + else: + variants_counts = variants_counts_ref + variants_prob_mass = 1.0 + last_post = 0.0 + while ((num_variants < variants_counts and post_tot < variants_prob_mass) + or (len(entry) > 0 and entry[-1][1] == last_post)): # this conditions + # means the posterior of the current pron is the same as the one we just included. 
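+ # (Note: each 'entry' list was sorted by ascending posterior in ComputePosteriors, so entry.pop() + # always returns the candidate with the highest remaining posterior.)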
+ try: + pron, post = entry.pop() + last_post = post + except IndexError: + break + post_tot += post + learned_lexicon[word].add(pron) + num_variants += 1 + if word in ref_lexicon and pron in ref_lexicon[word]: + reference_selected += 1 + elif word in g2p_lexicon and pron in g2p_lexicon[word]: + g2p_selected += 1 + else: + phonetic_decoding_selected += 1 + + while (num_variants < variants_counts and post_tot < args.variants_prob_mass_ref): + try: + pron, post = entry.pop() + except IndexError: + break + if word in ref_lexicon and pron in ref_lexicon[word]: + post_tot += post + learned_lexicon[word].add(pron) + num_variants += 1 + reference_selected += 1 + + num_prons_tot = reference_selected + g2p_selected + phonetic_decoding_selected + print('---------------------------------------------------------------------------------------------------', file=sys.stderr) + print ('Num. words in the learned lexicon: {0} num. selected prons: {1}'.format(len(learned_lexicon), num_prons_tot), file=sys.stderr) + print ('{0} selected prons came from reference candidate prons; {1} came from G2P candidate prons;' + '{2} came from phonetic-decoding candidate prons.'.format(reference_selected, g2p_selected, phonetic_decoding_selected), file=sys.stderr) + return learned_lexicon + +def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, phonetic_decoding_lexicon, g2p_lexicon, counts, stats): + # Note that learned_lexicon and ref_lexicon are dicts of sets of prons, while the other two lexicons are sets of (word, pron) pairs. + threshold = 3 + words = [defaultdict(set) for i in range(4)] # "words" contains four bins, where we + # classify each word into, according to whether it's count > threshold, + # and whether it's OOVs w.r.t the reference lexicon. + + src = {} + print("# Note: This file contains pronunciation info for words who have candidate" + "prons from G2P/phonetic-decoding accepted in the learned lexicon." + ", sorted by their counts in acoustic training data, " + ,file=args.ref_lexicon_edits_handle) + print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference)." + ,file=args.ref_lexicon_edits_handle) + print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=args.ref_lexicon_edits_handle) + print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts)." + ,file=args.ref_lexicon_edits_handle) + print("# 4th Col: the pronunciation cadidate.", file=args.ref_lexicon_edits_handle) + + # words which are to be printed into the edits file. + words_to_edit = [] + for word in learned_lexicon: + count = counts.get(word, 0) + flags = ['0' for i in range(3)] # "flags" contains three binary indicators, + # indicating where this word's pronunciations come from. 
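+ # (flags[0]: pron came from phonetic decoding, flags[1]: from the reference lexicon, flags[2]: from G2P, + # matching the assignments below.)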
+ for pron in learned_lexicon[word]: + if word in phonetic_decoding_lexicon and pron in phonetic_decoding_lexicon[word]: + flags[0] = '1' + src[(word, pron)] = 'P' + if word in ref_lexicon and pron in ref_lexicon[word]: + flags[1] = '1' + src[(word, pron)] = 'R' + if word in g2p_lexicon and pron in g2p_lexicon[word]: + flags[2] = '1' + src[(word, pron)] = 'G' + if word in ref_lexicon: + all_ref_prons_accepted = True + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + all_ref_prons_accepted = False + break + if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1': + words_to_edit.append((word, counts[word])) + if count > threshold: + words[0][flags[0] + flags[1] + flags[2]].add(word) + else: + words[1][flags[0] + flags[1] + flags[2]].add(word) + else: + if count > threshold: + words[2][flags[0] + flags[2]].add(word) + else: + words[3][flags[0] + flags[2]].add(word) + + words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True) + for word, count in words_to_edit_sorted: + print("------------",word, "%2.1f" % count, "--------------", file=args.ref_lexicon_edits_handle) + for pron in learned_lexicon[word]: + print(src[(word, pron)], ' | Y | ', "%2.1f | " % stats.get((word, pron), 0), pron, + file=args.ref_lexicon_edits_handle) + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + soft_count = stats.get((word, pron), 0) + print('R | N | {:.2f} | {} '.format(soft_count, pron), file=args.ref_lexicon_edits_handle) + print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr) + print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr) + print("In the learned lexicon, out of those", len(ref_lexicon), "words from the vocab of the reference lexicon:", file=sys.stderr) + print(" For those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr) + num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011']) + num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100']) + num_freq_ivs_from_ref = len(words[0]['010']) + num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011']) + num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100']) + num_infreq_ivs_from_ref = len(words[1]['010']) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr) + 
print("---------------------------------------------------------------------------------------------------", file=sys.stderr) + num_oovs = len(learned_lexicon) - len(ref_lexicon) + num_freq_oovs_from_both_sources = len(words[2]['11']) + num_freq_oovs_from_phonetic_decoding = len(words[2]['10']) + num_freq_oovs_from_g2p = len(words[2]['01']) + num_infreq_oovs_from_both_sources = len(words[3]['11']) + num_infreq_oovs_from_phonetic_decoding = len(words[3]['10']) + num_infreq_oovs_from_g2p = len(words[3]['01']) + print(' In the learned lexicon, out of those {} OOV words (w.r.t the reference lexicon):'.format(num_oovs), file=sys.stderr) + print(' For those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr) + +def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle): + for word, prons in learned_lexicon.iteritems(): + if word not in ref_lexicon: + for pron in prons: + print('{0} {1}'.format(word, pron), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + + # Read in three lexicon sources, word counts, and pron stats. + counts = ReadWordCounts(args.word_counts_file_handle) + ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts) + g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts) + phonetic_decoding_lexicon = ReadLexicon(args, args.phonetic_decoding_lexicon_handle, counts) + stats = ReadPronStats(args.pron_stats_file_handle) + phonetic_decoding_lexicon, stats = FilterPhoneticDecodingLexicon(args, phonetic_decoding_lexicon, stats) + + # Compute prior counts + prior_counts = ComputePriorCounts(args, counts, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon) + # Compute posteriors, and then select prons to construct the learned lexicon. + posteriors = ComputePosteriors(args, stats, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon, prior_counts) + + # Select prons to construct the learned lexicon. + learned_lexicon = SelectPronsBayesian(args, counts, posteriors, ref_lexicon, g2p_lexicon, phonetic_decoding_lexicon) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, args.learned_lexicon_oov_handle) + # Edits will be printed into ref_lexicon_edits, and the summary will be printed into stderr. 
+ WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, phonetic_decoding_lexicon, g2p_lexicon, counts, stats) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/train_g2p.sh b/egs/wsj/s5/steps/dict/train_g2p.sh new file mode 100755 index 00000000000..85e1605afba --- /dev/null +++ b/egs/wsj/s5/steps/dict/train_g2p.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Xiaohui Zhang +# Apache 2.0 + +# Begin configuration section. +iters=5 +stage=0 +encoding='utf-8' +only_words=true +cmd=run.pl +# a list of silence phones, like data/local/dict/silence_phones.txt +silence_phones= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +set -u +set -e + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " where is the training lexicon (one pronunciation per " + echo " word per line) and is directory where the models will " + echo " be stored" + echo "e.g.: train_g2p.sh data/local/lexicon.txt exp/g2p/" + echo "" + echo "main options (for others, see top of script file)" + echo " --iters # How many iterations. Relates to N-ngram order" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +lexicon=$1 +wdir=$2 + + +mkdir -p $wdir/log + +[ ! -f $lexicon ] && echo "$0: Training lexicon does not exist." && exit 1 + +# Optionally remove words that are mapped to a single silence phone from the lexicon. +if $only_words && [ -z $silence_phones ]; then + awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i;a[$1]=s;if(!(s in a)) print $1" "s}' \ + $silence_phones > $wdir/lexicon_onlywords.txt + lexicon=$wdir/lexicon_onlywords.txt +fi + +if ! g2p=`which g2p.py` ; then + echo "Sequitur was not found !" + echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh" + exit 1 +fi + +echo "Training the G2P model (iter 0)" + +if [ $stage -le 0 ]; then + $cmd $wdir/log/g2p.0.log \ + g2p.py -S --encoding $encoding --train $lexicon --devel 5% --write-model $wdir/g2p.model.0 +fi + +for i in `seq 0 $(($iters-2))`; do + + echo "Training the G2P model (iter $[$i + 1] )" + + if [ $stage -le $i ]; then + $cmd $wdir/log/g2p.$(($i + 1)).log \ + g2p.py -S --encoding $encoding --model $wdir/g2p.model.$i --ramp-up --train $lexicon --devel 5% --write-model $wdir/g2p.model.$(($i+1)) + fi + +done + +! (set -e; cd $wdir; ln -sf g2p.model.$[$iters-1] g2p.model.final ) && echo "Problem finalizing training... " && exit 1 + +if [ $stage -le $(($i + 2)) ]; then + echo "Running test..." + $cmd $wdir/log/test.log \ + g2p.py --encoding $encoding --model $wdir/g2p.model.final --test $lexicon +fi + diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py new file mode 100644 index 00000000000..2a472386568 --- /dev/null +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -0,0 +1,9 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +""" This package contains modules and subpackages used in kaldi scripts. +""" + +__all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/__init__.py b/egs/wsj/s5/steps/libs/nnet3/__init__.py new file mode 100644 index 00000000000..03131a3a8d6 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/__init__.py @@ -0,0 +1,12 @@ + +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vimal Manohar +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. 
+ + +# This module has the python functions which facilitate the use of nnet3 toolkit +# It has two sub-modules +# xconfig : Library for parsing high level description of neural networks +# train : Library for training scripts diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py new file mode 100644 index 00000000000..6c824b1195b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + +"""This library has classes and methods to form neural network computation graphs, +in the nnet3 framework, using higher level abstractions called 'layers' +(e.g. sub-graphs like LSTMS ). + +Note : We use the term 'layer' though the computation graph can have a highly +non-linear structure as, other terms such as nodes/components have already been +used in C++ codebase of nnet3. + +This is basically a config parser module, where the configs have very concise +descriptions of a neural network. + +This module has methods to convert the xconfigs into a configs interpretable by +nnet3 C++ library. + +It generates three different configs: + 'init.config' : which is the config with the info necessary for computing + the preconditioning matrix i.e., LDA transform + e.g. + input-node name=input dim=40 + input-node name=ivector dim=100 + output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear + + 'ref.config' : which is a version of the config file used to generate + a model for getting left and right context (it doesn't read + anything for the LDA-like transform and/or + presoftmax-prior-scale components) + + 'final.config' : which has the actual config used to initialize the model used + in training i.e, it has file paths for LDA transform and + other initialization files +""" + + +__all__ = ["utils", "layers", "parser"] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py new file mode 100644 index 00000000000..52f366b4cc2 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -0,0 +1,906 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the parent class from which all layers are inherited +and some basic layer definitions. +""" + +from __future__ import print_function +import sys +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +class XconfigLayerBase(object): + """ A base-class for classes representing layers of xconfig files. + """ + + def __init__(self, first_token, key_to_value, all_layers): + """ + first_token: first token on the xconfig line, e.g. 'affine-layer'.f + key_to_value: dictionary with parameter values + { 'name':'affine1', + 'input':'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))', + 'dim=1024' }. + The only required and 'special' values that are dealt with directly + at this level, are 'name' and 'input'. The rest are put in + self.config and are dealt with by the child classes' init functions. + all_layers: An array of objects inheriting XconfigLayerBase for all + previously parsed layers. 
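+ (These arguments would typically come from parsing an xconfig line such as + 'affine-layer name=affine1 input=Append(0, 1, 2, ReplaceIndex(ivector, t, 0)) dim=1024'.)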
+ """ + + self.layer_type = first_token + if not 'name' in key_to_value: + raise xparser_error("Expected 'name' to be specified.", self.str()) + self.name = key_to_value['name'] + if not xutils.is_valid_line_name(self.name): + raise xparser_error("Invalid value: name={0}".format( + key_to_value['name']), self.str()) + + # the following, which should be overridden in the child class, sets + # default config parameters in self.config. + self.set_default_configs() + # The following is not to be reimplemented in child classes; + # it sets the config values to those specified by the user, and + # parses any Descriptors. + self.set_configs(key_to_value, all_layers) + # This method, sets the derived default config values + # i.e., config values when not specified can be derived from + # other values. It can be overridden in the child class. + self.set_derived_configs() + # the following, which should be overridden in the child class, checks + # that the config parameters that have been set are reasonable. + self.check_configs() + + + def set_configs(self, key_to_value, all_layers): + """ Sets the config variables. + We broke this code out of __init__ for clarity. + the child-class constructor will deal with the configuration values + in a more specific way. + """ + + for key,value in key_to_value.items(): + if key != 'name': + if not key in self.config: + raise xparser_error("Configuration value {0}={1} was not" + " expected in layer of type {2}" + "".format(key, value, self.layer_type), + self.str()) + self.config[key] = xutils.convert_value_to_type(key, + type(self.config[key]), + value) + self.descriptors = dict() + self.descriptor_dims = dict() + # Parse Descriptors and get their dims and their 'final' string form. + # in self.descriptors[key] + for key in self.get_input_descriptor_names(): + if not key in self.config: + raise xparser_error("{0}: object of type {1} needs to override" + " get_input_descriptor_names()." + "".format(sys.argv[0], str(type(self))), + self.str()) + descriptor_string = self.config[key] # input string. + assert isinstance(descriptor_string, str) + desc = self.convert_to_descriptor(descriptor_string, all_layers) + desc_dim = self.get_dim_for_descriptor(desc, all_layers) + desc_norm_str = desc.str() + + # desc_output_str contains the "final" component names, those that + # appear in the actual config file (i.e. not names like + # 'layer.auxiliary_output'); that's how it differs from desc_norm_str. + # Note: it's possible that the two strings might be the same in + # many, even most, cases-- it depends whether + # output_name(self, auxiliary_output) + # returns self.get_name() + '.' + auxiliary_output + # when auxiliary_output is not None. + # That's up to the designer of the layer type. + desc_output_str = self.get_string_for_descriptor(desc, all_layers) + self.descriptors[key] = {'string':desc, + 'normalized-string':desc_norm_str, + 'final-string':desc_output_str, + 'dim':desc_dim} + + # the following helps to check the code by parsing it again. + desc2 = self.convert_to_descriptor(desc_norm_str, all_layers) + desc_norm_str2 = desc2.str() + # if the following ever fails we'll have to do some debugging. 
+ if desc_norm_str != desc_norm_str2: + raise xparser_error("Likely code error: '{0}' != '{1}'" + "".format(desc_norm_str, desc_norm_str2), + self.str()) + + def str(self): + """Converts 'this' to a string which could be printed to + an xconfig file; in xconfig_to_configs.py we actually expand all the + lines to strings and write it as xconfig.expanded as a reference + (so users can see any defaults). + """ + + ans = '{0} name={1}'.format(self.layer_type, self.name) + ans += ' ' + ' '.join([ '{0}={1}'.format(key, self.config[key]) + for key in sorted(self.config.keys())]) + return ans + + def __str__(self): + + return self.str() + + + def normalize_descriptors(self): + """Converts any config variables in self.config which correspond to + Descriptors, into a 'normalized form' derived from parsing them as + Descriptors, replacing things like [-1] with the actual layer names, + and regenerating them as strings. We stored this when the object was + initialized, in self.descriptors; this function just copies them back + to the config. + """ + + for key, desc_str_dict in self.descriptors.items(): + self.config[key] = desc_str_dict['normalized-string'] + + def convert_to_descriptor(self, descriptor_string, all_layers): + """Convenience function intended to be called from child classes, + converts a string representing a descriptor ('descriptor_string') + into an object of type Descriptor, and returns it. It needs 'self' and + 'all_layers' (where 'all_layers' is a list of objects of type + XconfigLayerBase) so that it can work out a list of the names of other + layers, and get dimensions from them. + """ + + prev_names = xutils.get_prev_names(all_layers, self) + tokens = xutils.tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (descriptor, pos) = xutils.parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise xparser_error("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1]), self.str()) + return descriptor + + def get_dim_for_descriptor(self, descriptor, all_layers): + """Returns the dimension of a Descriptor object. This is a convenience + function used in set_configs. + """ + + layer_to_dim_func = \ + lambda name: xutils.get_dim_from_layer_name(all_layers, self, + name) + return descriptor.dim(layer_to_dim_func) + + def get_string_for_descriptor(self, descriptor, all_layers): + """Returns the 'final' string form of a Descriptor object, + as could be used in config files. This is a convenience function + provided for use in child classes; + """ + + layer_to_string_func = \ + lambda name: xutils.get_string_from_layer_name(all_layers, + self, name) + return descriptor.config_string(layer_to_string_func) + + def get_name(self): + """Returns the name of this layer, e.g. 'affine1'. It does not + necessarily correspond to a component name. + """ + + return self.name + + ###### Functions that might be overridden by the child class: ##### + + def set_default_configs(self): + """Child classes should override this. + """ + + raise Exception("Child classes must override set_default_configs().") + + def set_derived_configs(self): + """This is expected to be called after set_configs and before + check_configs(). + """ + + if self.config['dim'] <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + + def check_configs(self): + """child classes should override this. 
+ """ + + pass + + def get_input_descriptor_names(self): + """This function, which may be (but usually will not have to be) + overridden by child classes, returns a list of names of the input + descriptors expected by this component. Typically this would just + return ['input'] as most layers just have one 'input'. However some + layers might require more inputs (e.g. cell state of previous LSTM layer + in Highway LSTMs). It is used in the function 'normalize_descriptors()'. + This implementation will work for layer types whose only + Descriptor-valued config is 'input'. + If a child class adds more inputs, or does not have an input + (e.g. the XconfigInputLayer), it should override this function's + implementation to something like: `return ['input', 'input2']` + """ + + return [ 'input' ] + + def auxiliary_outputs(self): + """Returns a list of all auxiliary outputs that this layer supports. + These are either 'None' for the regular output, or a string + (e.g. 'projection' or 'memory_cell') for any auxiliary outputs that + the layer might provide. Most layer types will not need to override + this. + """ + + return [ None ] + + def output_name(self, auxiliary_output = None): + """Called with auxiliary_output == None, this returns the component-node + name of the principal output of the layer (or if you prefer, the text + form of a descriptor that gives you such an output; such as + Append(some_node, some_other_node)). + The 'auxiliary_output' argument is a text value that is designed for + extensions to layers that have additional auxiliary outputs. + For example, to implement a highway LSTM you need the memory-cell of a + layer, so you might allow auxiliary_output='memory_cell' for such a + layer type, and it would return the component node or a suitable + Descriptor: something like 'lstm3.c_t' + """ + + raise Exception("Child classes must override output_name()") + + def output_dim(self, auxiliary_output = None): + """The dimension that this layer outputs. The 'auxiliary_output' + parameter is for layer types which support auxiliary outputs. + """ + + raise Exception("Child classes must override output_dim()") + + def get_full_config(self): + """This function returns lines destined for the 'full' config format, as + would be read by the C++ programs. Since the program + xconfig_to_configs.py writes several config files, this function returns + a list of pairs of the form (config_file_basename, line), + e.g. something like + [ ('init', 'input-node name=input dim=40'), + ('ref', 'input-node name=input dim=40') ] + which would be written to config_dir/init.config and config_dir/ref.config. + """ + + raise Exception("Child classes must override get_full_config()") + + +class XconfigInputLayer(XconfigLayerBase): + """This class is for lines like + 'input name=input dim=40' + or + 'input name=ivector dim=100' + in the config file. + """ + + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'input' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + self.config = { 'dim': -1} + + def check_configs(self): + + if self.config['dim'] <= 0: + raise xparser_error("Dimension of input-layer '{0}'" + "should be positive.".format(self.name), + self.str()) + + def get_input_descriptor_names(self): + + return [] # there is no 'input' field in self.config. 
+ + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.config['dim'] + + def get_full_config(self): + + # unlike other layers the input layers need to be printed in + # 'init.config' (which initializes the neural network prior to the LDA) + ans = [] + for config_name in [ 'init', 'ref', 'final' ]: + ans.append( (config_name, + 'input-node name={0} dim={1}'.format(self.name, + self.config['dim']))) + return ans + + + +class XconfigTrivialOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + This is for outputs that are not really output "layers" + (there is no affine transform or nonlinearity), they just directly map to an + output-node in nnet3. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]' } + + def check_configs(self): + + pass # nothing to check; descriptor-parsing can't happen in this function. + + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the output + # of the previous layer + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + assert auxiliary_outputs is None + # note: each value of self.descriptors is (descriptor, dim, normalized-string, output-string). + return self.descriptors['input']['dim'] + + def get_full_config(self): + + # the input layers need to be printed in 'init.config' (which + # initializes the neural network prior to the LDA), in 'ref.config', + # which is a version of the config file used for getting left and right + # context (it doesn't read anything for the LDA-like transform and/or + # presoftmax-prior-scale components) + # In 'full.config' we write everything, this is just for reference, + # and also for cases where we don't use the LDA-like transform. + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'output-string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_str = self.descriptors['input']['final-string'] + + for config_name in ['init', 'ref', 'final' ]: + ans.append( (config_name, + 'output-node name={0} input={1}'.format( + self.name, descriptor_final_str))) + return ans + + +class XconfigOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output-layer name=output dim=4257 input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + By default this includes a log-softmax component. The parameters are + initialized to zero, asthis is best for output layers. + + Parameters of the class, and their defaults: + input='[-1]' : Descriptor giving the input of the layer. + dim=None : Output dimension of layer, will normally equal the number of pdfs. + include-log-softmax=true : setting it to false will omit the + log-softmax component- useful for chain models. 
+ objective-type=linear : the only other choice currently is + 'quadratic', for use in regression problems + learning-rate-factor=1.0 : Learning rate factor for the final + affine component, multiplies the standard learning rate. normally + you'll leave this as-is, but for xent regularization output layers + for chain models you'll want to set + learning-rate-factor=(0.5/xent_regularize), + normally learning-rate-factor=5.0 since xent_regularize is + normally 0.1. + presoftmax-scale-file=None : If set, a filename for a vector that + will be used to scale the output of the affine component before the + log-softmax (if include-log-softmax=true), or before the output + (if not). This is helpful to avoid instability in training due to + some classes having much more data than others. The way we normally + create this vector is to take the priors of the classes to the + power -0.25 and rescale them so the average is 1.0. This factor + -0.25 is referred to as presoftmax_prior_scale_power in scripts. In + the scripts this would normally be set to + config_dir/presoftmax_prior_scale.vec + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = {'input' : '[-1]', + 'dim' : -1, + 'include-log-softmax' : True, + # this would be false for chain models + 'objective-type' : 'linear', + # see Nnet::ProcessOutputNodeConfigLine in + # nnet-nnet.cc for other options + 'learning-rate-factor' : 1.0, + 'presoftmax-scale-file' : '', + # used in DNN (not RNN) training when using + # frame-level objfns, + 'max-change' : 1.5, + 'param-stddev' : 0.0, + 'bias-stddev' : 0.0, + 'output-delay' : 0 + } + + def check_configs(self): + + if self.config['dim'] <= -1: + raise xparser_error("In output-layer, dim has invalid value {0}" + "".format(self.config['dim']), self.str()) + + if self.config['objective-type'] != 'linear' and \ + self.config['objective_type'] != 'quadratic': + raise xparser_error("In output-layer, objective-type has" + " invalid value {0}" + "".format(self.config['objective-type']), + self.str()) + + if self.config['learning-rate-factor'] <= 0.0: + raise xparser_error("In output-layer, learning-rate-factor has" + " invalid value {0}" + "".format(self.config['learning-rate-factor']), + self.str()) + + + # you cannot access the output of this layer from other layers... see + # comment in output_name for the reason why. + def auxiliary_outputs(self): + + return [] + + def output_name(self, auxiliary_outputs = None): + + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. But it might make sense to make the output of the softmax + # layer and/or the output of the affine layer available as inputs to + # other layers, in some circumstances. + # we'll implement that when it's needed. + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def output_dim(self, auxiliary_output = None): + + # see comment in output_name(). + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def get_full_config(self): + + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). 
+ # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + objective_type = self.config['objective-type'] + learning_rate_factor = self.config['learning-rate-factor'] + include_log_softmax = self.config['include-log-softmax'] + presoftmax_scale_file = self.config['presoftmax-scale-file'] + param_stddev = self.config['param-stddev'] + bias_stddev = self.config['bias-stddev'] + output_delay = self.config['output-delay'] + max_change = self.config['max-change'] + + # note: ref.config is used only for getting the left-context and + # right-context of the network; + # final.config is where we put the actual network definition. + for config_name in [ 'ref', 'final' ]: + # First the affine node. + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} ' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change) + + ('learning-rate-factor={0} '.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '')) + ans.append((config_name, line)) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + if presoftmax_scale_file is not '' and config_name == 'final': + # don't use the presoftmax-scale in 'ref.config' since that + # file won't exist at the time we evaluate it. + # (ref.config is used to find the left/right context). + line = ('component name={0}.fixed-scale' + ' type=FixedScaleComponent scales={1}' + ''.format(self.name, presoftmax_scale_file)) + ans.append((config_name, line)) + + line = ('component-node name={0}.fixed-scale' + ' component={0}.fixed-scale input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.fixed-scale'.format(self.name) + + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + ans.append((config_name, line)) + + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.log-softmax'.format(self.name) + + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + + line = ('output-node name={0} input={1}'.format(self.name, cur_node)) + ans.append((config_name, line)) + return ans + + +# This class is for parsing lines like +# 'relu-renorm-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# or: +# 'sigmoid-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# which specify addition of an affine component and a sequence of non-linearities. +# Here, the name of the layer itself dictates the sequence of nonlinearities +# that are applied after the affine component; the name should contain some +# combination of 'relu', 'renorm', 'sigmoid' and 'tanh', +# and these nonlinearities will be added along with the affine component. +# +# The dimension specified is the output dim; the input dim is worked out from the input descriptor. 
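+# (Illustrative example: 'relu-renorm-layer name=tdnn1 dim=512' expands to a NaturalGradientAffineComponent +# followed by RectifiedLinearComponent and NormalizeComponent nodes, and its output name is 'tdnn1.renorm'.)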
+# This class supports only nonlinearity types that do not change the dimension; we can create +# another layer type to enable the use p-norm and similar dimension-reducing nonlinearities. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer, e.g. 1024] +# self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] +# +class XconfigBasicLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + # Here we just list some likely combinations.. you can just add any + # combinations you want to use, to this list. + assert first_token in [ 'relu-layer', 'relu-renorm-layer', 'sigmoid-layer', + 'tanh-layer' ] + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim':-1, + 'max-change' : 0.75, + 'self-repair-scale' : 1.0e-05, + 'target-rms' : 1.0, + 'ng-affine-options' : ''} + + def check_configs(self): + if self.config['dim'] < 0: + raise xparser_error("dim has invalid value {0}".format(self.config['dim']), self.str()) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: + raise xparser_error("self-repair-scale has invalid value {0}".format(self.config['self-repair-scale']), self.str()) + if self.config['target-rms'] < 0.0: + raise xparser_error("target-rms has invalid value {0}".format(self.config['target-rms']), self.str()) + + def output_name(self, auxiliary_output=None): + # at a later stage we might want to expose even the pre-nonlinearity + # vectors + assert auxiliary_output == None + + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + last_nonlinearity = split_layer_name[-2] + # return something like: layer3.renorm + return '{0}.{1}'.format(self.name, last_nonlinearity) + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + nonlinearities = split_layer_name[:-1] + + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + + # the child classes e.g. tdnn might want to process the input + # before adding the other components + + return self._add_components(input_desc, input_dim, nonlinearities) + + def _add_components(self, input_desc, input_dim, nonlinearities): + output_dim = self.output_dim() + self_repair_scale = self.config['self-repair-scale'] + target_rms = self.config['target-rms'] + max_change = self.config['max-change'] + ng_opt_str = self.config['ng-affine-options'] + + configs = [] + # First the affine node. 
+ line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' max-change={3}' + ' {4}' + ''.format(self.name, input_dim, output_dim, + max_change, ng_opt_str)) + configs.append(line) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, input_desc)) + configs.append(line) + cur_node = '{0}.affine'.format(self.name) + + for nonlinearity in nonlinearities: + if nonlinearity == 'relu': + line = ('component name={0}.{1}' + ' type=RectifiedLinearComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'sigmoid': + line = ('component name={0}.{1}' + ' type=SigmoidComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'tanh': + line = ('component name={0}.{1}' + ' type=TanhComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'renorm': + line = ('component name={0}.{1}' + ' type=NormalizeComponent dim={2}' + ' target-rms={3}' + ''.format(self.name, nonlinearity, output_dim, + target_rms)) + + else: + raise xparser_error("Unknown nonlinearity type:" + "{0}".format(nonlinearity), self.str()) + + configs.append(line) + line = ('component-node name={0}.{1}' + ' component={0}.{1} input={2}' + ''.format(self.name, nonlinearity, cur_node)) + + configs.append(line) + cur_node = '{0}.{1}'.format(self.name, nonlinearity) + return configs + + +# This class is for lines like +# 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat' +# +# The output dimension of the layer may be specified via 'dim=xxx', but if not specified, +# the dimension defaults to the same as the input. Note: we don't attempt to read that +# file at the time the config is created, because in the recipes, that file is created +# after the config files. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer; defaults to the same as the input dim.] +# affine-transform-file='' [Must be specified.] +# +class XconfigFixedAffineLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'fixed-affine-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim':-1, + 'affine-transform-file':''} + + def check_configs(self): + if self.config['affine-transform-file'] is None: + raise xparser_error("affine-transform-file must be set.", self.str()) + + def output_name(self, auxiliary_output = None): + # Fixed affine layer computes only one vector, there are no intermediate + # vectors. + assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). 
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        descriptor_final_string = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+        output_dim = self.output_dim()
+        transform_file = self.config['affine-transform-file']
+
+
+        # to init.config we write an output-node with the name 'output' and
+        # with a Descriptor equal to the descriptor that's the input to this
+        # layer.  This will be used to accumulate stats to learn the LDA transform.
+        line = 'output-node name=output input={0}'.format(descriptor_final_string)
+        ans.append(('init', line))
+
+        # write the 'real' component to final.config
+        line = 'component name={0} type=FixedAffineComponent matrix={1}'.format(
+            self.name, transform_file)
+        ans.append(('final', line))
+        # write a random version of the component, with the same dims, to ref.config
+        line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format(
+            self.name, input_dim, output_dim)
+        ans.append(('ref', line))
+        # the component-node gets written to final.config and ref.config.
+        line = 'component-node name={0} component={0} input={1}'.format(
+            self.name, descriptor_final_string)
+        ans.append(('final', line))
+        ans.append(('ref', line))
+        return ans
+
+# This class is for lines like
+#  'affine-layer name=affine input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0))'
+#
+# The output dimension of the layer may be specified via 'dim=xxx', but if not specified,
+# the dimension defaults to the same as the input.
+#
+# See other configuration values below.
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'    [Descriptor giving the input of the layer.]
+#   dim=None        [Output dimension of layer; defaults to the same as the input dim.]
+#
+class XconfigAffineLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == 'affine-layer'
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        # note: self.config['input'] is a descriptor, '[-1]' means output
+        # the most recent layer.
+        # use None for optional parameters as we want to default to the C++ defaults
+        # C++ component provides more options but I will just expose these for now
+        # Note : The type of the parameter is determined based on the value assigned
+        # so please use a decimal point if your parameter is a float
+        self.config = { 'input' : '[-1]',
+                        'dim' : -1,
+                        'param-stddev' : -1.0, # this has to be initialized to 1/sqrt(input_dim)
+                        'bias-stddev' : 1.0,
+                        'bias-mean' : 0.0,
+                        'max-change' : 0.75,
+                        'learning-rate-factor' : 1.0,
+                        'ng-affine-options' : ''}
+
+    def set_derived_configs(self):
+        super(XconfigAffineLayer, self).set_derived_configs()
+        if self.config['param-stddev'] < 0:
+            # default param-stddev is 1/sqrt(input_dim), as noted above.
+            self.config['param-stddev'] = 1.0 / (self.descriptors['input']['dim'] ** 0.5)
+
+    def check_configs(self):
+        if self.config['dim'] <= 0:
+            raise xparser_error("dim has invalid value {0}".format(self.config['dim']), self.str())
+
+    def output_name(self, auxiliary_output = None):
+        # affine layer computes only one vector, there are no intermediate
+        # vectors.
+ assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + + option_string='' + for key in ['param-stddev', 'bias-stddev', 'bias-mean', 'max-change']: + option_string += ' {0}={1}'.format(key, self.config[key]) + option_string += self.config['ng-affine-options'] + + conf_lines = [] + # write the 'real' component to final.config + conf_lines.append('component name={n} type=NaturalGradientAffineComponent ' + 'input-dim={i} output-dim={o} {opts}'.format(n = self.name, + i = input_dim, + o = output_dim, + opts = option_string)) + # the component-node gets written to final.config and ref.config. + conf_lines.append('component-node name={0} component={0} input={1}'.format(self.name, + descriptor_final_string)) + + # the config is same for both final and ref configs + for conf_name in ['final', 'ref']: + for line in conf_lines: + ans.append((conf_name, line)) + return ans + + +def test_layers(): + # for some config lines that should be printed the same way as they + # are read, check that this is the case. + for x in [ 'input name=input dim=30' ]: + assert str(config_line_to_object(x, [])) == x diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py new file mode 100644 index 00000000000..353b9d3bba4 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -0,0 +1,8 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + +from basic_layers import * +from lstm import * +from tdnn import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py new file mode 100644 index 00000000000..7b37958f81b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -0,0 +1,532 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + + +""" This module has the implementations of different LSTM layers. +""" +import re + +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# This class is for lines like +# 'lstm-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph without output projections. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. 
This is the threshold used to decide if clipping has to be activated ]
+# norm-based-clipping=True [specifies if the gradient clipping has to be activated based on total norm or on per-element magnitude]
+# self-repair-scale-nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
+#                                      i.e. SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
+# ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the LSTM ]
+# ng-affine-options=''              [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1]
+class XconfigLstmLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == "lstm-layer"
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input' : '[-1]',
+                       'cell-dim' : -1, # this is a compulsory argument
+                       'clipping-threshold' : 30.0,
+                       'norm-based-clipping' : True,
+                       'delay' : -1,
+                       'ng-per-element-scale-options' : ' max-change=0.75',
+                       'ng-affine-options' : ' max-change=0.75 ',
+                       'self-repair-scale-nonlinearity' : 0.00001,
+                       'zeroing-interval' : 20,
+                       'zeroing-threshold' : 3.0
+                       }
+
+    def set_derived_configs(self):
+        if self.config['cell-dim'] <= 0:
+            self.config['cell-dim'] = self.descriptors['input']['dim']
+
+    def check_configs(self):
+        key = 'cell-dim'
+        if self.config['cell-dim'] <= 0:
+            raise xparser_error("cell-dim has invalid value {0}.".format(self.config[key]), self.str())
+
+        for key in ['self-repair-scale-nonlinearity']:
+            if self.config[key] < 0.0 or self.config[key] > 1.0:
+                raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str())
+
+    def auxiliary_outputs(self):
+        return ['c_t']
+
+    def output_name(self, auxiliary_output = None):
+        node_name = 'm_t'
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                node_name = auxiliary_output
+            else:
+                raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str())
+
+        return '{0}.{1}'.format(self.name, node_name)
+
+    def output_dim(self, auxiliary_output = None):
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str())
+
+        return self.config['cell-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self.generate_lstm_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in LSTM initialization
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    # convenience function to generate the LSTM config
+    def generate_lstm_config(self):
+
+        # assign some variables to reduce verbosity
+        name = self.name
+        # in the below code we will just call descriptor_strings as descriptors for conciseness
+        input_dim = self.descriptors['input']['dim']
+        input_descriptor = self.descriptors['input']['final-string']
+        cell_dim = self.config['cell-dim']
+        delay = self.config['delay']
+
+        repair_nonlin = self.config['self-repair-scale-nonlinearity']
+        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
+        bptrunc_str = ("clipping-threshold={0}"
+                       " zeroing-threshold={1}"
+                       " zeroing-interval={2}"
+                       " recurrence-interval={3}"
+                       "".format(self.config['clipping-threshold'],
+                                 self.config['zeroing-threshold'],
+                                 self.config['zeroing-interval'],
+                                 abs(delay)))
+        affine_str = self.config['ng-affine-options']
+        pes_str = self.config['ng-per-element-scale-options']
+        # Natural gradient per element scale parameters
+        # TODO: decide if we want to keep exposing these options
+        if re.search('param-mean', pes_str) is None and \
+           re.search('param-stddev', pes_str) is None:
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
+
+        configs = []
+
+        # the equations implemented here are
+        # TODO: write these
+        # naming convention
+        # <layer-name>.W_<output-name>.<input-name> e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r]
+        configs.append("# Input gate control : W_i* matrices")
+        configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+        configs.append("# note : the cell outputs pass through a diagonal matrix")
+        configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str))
+
+        configs.append("# Forget gate control : W_f* matrices")
+        configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+        configs.append("# note : the cell outputs pass through a diagonal matrix")
+        configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str))
+
+        configs.append("# Output gate control : W_o* matrices")
+        configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+        configs.append("# note : the cell outputs pass through a diagonal matrix")
+        configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str))
+
+        configs.append("# Cell input matrices : W_c* matrices")
+        configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str))
+
+
+        configs.append("# Defining the non-linearities")
+        configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str))
+        configs.append("component name={0}.g type=TanhComponent dim={1} 
{2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name)) + + return configs + + +# This class is for lines like +# 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigLstmLayer for this +# simple LSTM. 
+# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified,
+# the dimension defaults to the same as the input.
+# See other configuration values below.
+#
+# Parameters of the class, and their defaults:
+#   input='[-1]'             [Descriptor giving the input of the layer.]
+#   cell-dim=-1              [Dimension of the cell]
+#   recurrent-projection-dim=-1     [Dimension of the projection used in recurrent connections; defaults to cell-dim/2]
+#   non-recurrent-projection-dim=-1 [Dimension of the projection in non-recurrent connections; defaults to cell-dim/2]
+#   delay=-1                 [Delay in the recurrent connections of the LSTM ]
+#   clipping-threshold=30    [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ]
+#   norm-based-clipping=True [specifies if the gradient clipping has to be activated based on total norm or on per-element magnitude]
+#   self-repair-scale-nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent,
+#                                      i.e. SigmoidComponent, TanhComponent and RectifiedLinearComponent ]
+#   ng-per-element-scale-options=''   [Additional options used for the diagonal matrices in the LSTM ]
+#   ng-affine-options=''              [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1]
+class XconfigLstmpLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names = None):
+        assert first_token == "lstmp-layer"
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input' : '[-1]',
+                       'cell-dim' : -1, # this is a compulsory argument
+                       'recurrent-projection-dim' : -1,
+                       'non-recurrent-projection-dim' : -1,
+                       'clipping-threshold' : 30.0,
+                       'norm-based-clipping' : True,
+                       'delay' : -1,
+                       'ng-per-element-scale-options' : ' max-change=0.75 ',
+                       'ng-affine-options' : ' max-change=0.75 ',
+                       'self-repair-scale-nonlinearity' : 0.00001,
+                       'zeroing-interval' : 20,
+                       'zeroing-threshold' : 3.0
+                       }
+
+    def set_derived_configs(self):
+        if self.config['cell-dim'] <= 0:
+            self.config['cell-dim'] = self.descriptors['input']['dim']
+
+        for key in ['recurrent-projection-dim', 'non-recurrent-projection-dim']:
+            if self.config[key] <= 0:
+                self.config[key] = self.config['cell-dim'] / 2
+
+    def check_configs(self):
+        for key in ['cell-dim', 'recurrent-projection-dim', 'non-recurrent-projection-dim']:
+            if self.config[key] <= 0:
+                raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str())
+
+        for key in ['self-repair-scale-nonlinearity']:
+            if self.config[key] < 0.0 or self.config[key] > 1.0:
+                raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str())
+
+    def auxiliary_outputs(self):
+        return ['c_t']
+
+    def output_name(self, auxiliary_output = None):
+        node_name = 'rp_t'
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                node_name = auxiliary_output
+            else:
+                raise Exception("In layer of type {0}: unknown auxiliary output name '{1}'".format(self.layer_type, auxiliary_output))
+
+        return '{0}.{1}'.format(self.name, node_name)
+
+    def output_dim(self, auxiliary_output = None):
+        if auxiliary_output is not None:
+            if auxiliary_output in self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise Exception("In {0} of type {1}, unknown auxiliary output name 
{1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_lstm_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + configs = [] + # the equations implemented here are from Sak et. al. "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" + # http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf + # naming convention + # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Input gate control : W_i* matrices") + configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Forget gate control : W_f* matrices") + configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Output gate control : W_o* matrices") + configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Cell input matrices : W_c* matrices") + configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + recurrent_connection = '{0}.r_t'.format(name) + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node 
name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs + +# Same as the LSTMP layer except that the matrix multiplications are combined +# we probably keep only version after experimentation. 
One year old experiments +# show that this version is slightly worse and might require some tuning +class XconfigLstmpcLayer(XconfigLstmpLayer): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstmpc-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', ng_per_element_scale_options) is None and \ + re.search('param-stddev', ng_per_element_scale_options) is None: + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + pes_str = ng_per_element_scale_options + + configs = [] + # naming convention + # .W_. e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Full W_ifoc* matrix") + configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + + # we will not combine the diagonal matrix operations as one of these has a different delay + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 
* cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + rec_connection = '{0}.rp_t'.format(name) + + component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + + + offset = 0 + component_nodes.append("# i_t") + component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, recurrent_projection_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node 
name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py new file mode 100644 index 00000000000..4976084a977 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -0,0 +1,94 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the top level xconfig parsing functions. +""" + +import libs.nnet3.xconfig.layers as xlayers +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# We have to modify this dictionary when adding new layers +config_to_layer = { + 'input' : xlayers.XconfigInputLayer, + 'output' : xlayers.XconfigTrivialOutputLayer, + 'output-layer' : xlayers.XconfigOutputLayer, + 'relu-layer' : xlayers.XconfigBasicLayer, + 'relu-renorm-layer' : xlayers.XconfigBasicLayer, + 'sigmoid-layer' : xlayers.XconfigBasicLayer, + 'tanh-layer' : xlayers.XconfigBasicLayer, + 'tdnn-relu-layer' : xlayers.XconfigTdnnLayer, + 'tdnn-relu-renorm-layer' : xlayers.XconfigTdnnLayer, + 'tdnn-sigmoid-layer' : xlayers.XconfigTdnnLayer, + 'tdnn-tanh-layer' : xlayers.XconfigTdnnLayer, + 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, + 'affine-layer' : xlayers.XconfigAffineLayer, + 'lstm-layer' : xlayers.XconfigLstmLayer, + 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmpc-layer' : xlayers.XconfigLstmpcLayer + } + +# Converts a line as parsed by ParseConfigLine() into a first +# token e.g. 'input-layer' and a key->value map, into +# an objet inherited from XconfigLayerBase. +# 'prev_names' is a list of previous layer names, it's needed +# to parse things like '[-1]' (meaning: the previous layer) +# when they appear in Desriptors. +def parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names): + + conf_line = first_token + ' ' + ' '.join(['{0}={1}'.format(x,y) for x,y in key_to_value.items()]) + + if not config_to_layer.has_key(first_token): + raise xparser_error("No such layer type.", conf_line) + + try: + return config_to_layer[first_token](first_token, key_to_value, prev_names) + except xparser_error as e: + if e.conf_line is None: + # we want to throw informative errors which point to the xconfig line + e.conf_line = conf_line + raise + +# Uses ParseConfigLine() to turn a config line that has been parsed into +# a first token e.g. 'affine-layer' and a key->value map like { 'dim':'1024', 'name':'affine1' }, +# and then turns this into an object representing that line of the config file. +# 'prev_names' is a list of the names of preceding lines of the +# config file. +def config_line_to_object(config_line, prev_names = None): + (first_token, key_to_value) = xutils.parse_config_line(config_line) + return parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names) + +# This function reads an xconfig file and returns it as a list of layers +# (usually we use the variable name 'all_layers' elsewhere for this). +# It will die if the xconfig file is empty or if there was +# some error parsing it. 
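+# As a rough sketch of the expected input (the layer names and dimensions here
+# are only illustrative), an xconfig file might contain lines like:
+#   input name=input dim=40
+#   relu-renorm-layer name=layer1 dim=512 input=Append(-1,0,1)
+#   output-layer name=output dim=3000
+# for which this function would return a list of three objects of types
+# XconfigInputLayer, XconfigBasicLayer and XconfigOutputLayer respectively.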
+def read_xconfig_file(xconfig_filename): + try: + f = open(xconfig_filename, 'r') + except Exception as e: + sys.exit("{0}: error reading xconfig file '{1}'; error was {2}".format( + sys.argv[0], xconfig_filename, repr(e))) + all_layers = [] + while True: + line = f.readline() + if line == '': + break + x = xutils.parse_config_line(line) + if x is None: + continue # line was blank or only comments. + (first_token, key_to_value) = x + # the next call will raise an easy-to-understand exception if + # it fails. + this_layer = parsed_line_to_xconfig_layer(first_token, + key_to_value, + all_layers) + all_layers.append(this_layer) + if len(all_layers) == 0: + raise xparser_error("{0}: xconfig file '{1}' is empty".format( + sys.argv[0], xconfig_filename)) + f.close() + return all_layers + + diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py new file mode 100644 index 00000000000..21f9db4f5c8 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/tdnn.py @@ -0,0 +1,110 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + + +""" This module contains the implementation of the TDNN layer. +""" + +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error +from libs.nnet3.xconfig.basic_layers import XconfigBasicLayer +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase + +class XconfigTdnnLayer(XconfigBasicLayer): + """This class is for parsing lines like + tdnn-relu-renorm-layer name=tdnn1 dim=1024 splice-indexes=-3,0,3 subset-dim=512 + + It is similar to XconfigBasicLayer except for the way in which the input + splicing is done. So we derive this class from XconfigBasicLayer. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token in [ 'tdnn-relu-layer', 'tdnn-relu-renorm-layer', + 'tdnn-sigmoid-layer', 'tdnn-tanh-layer' ] + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + super(XconfigTdnnLayer, self).set_default_configs() + + self.config['splice-indexes'] = '' + self.config['subset-dim'] = -1 + + def check_configs(self): + + if self.config['splice-indexes'] == '': + raise xparser_error("splice-indexes has to be non-empty", self.str()) + super(XconfigTdnnLayer, self).check_configs() + + + def _generate_config(self): + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + # ignore the first 'tdnn' and the last 'layer' + nonlinearities = split_layer_name[1:-1] + + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + splice_indexes = self.get_splice_indexes() + input_desc, input_dim, sp_configs = self.splice_input(input_desc, + input_dim, splice_indexes, self.config['subset-dim'], + '{0}.input-subset'.format(self.name)) + + return sp_configs + self._add_components(input_desc, input_dim, nonlinearities) + + def get_splice_indexes(self): + try: + return map(lambda x: int(x), self.config['splice-indexes'].split(",")) + except ValueError: + raise xparser_error("Invalid value for splice-indexes.", str(self)) + + @staticmethod + def splice_input(input_desc, input_dim, + splice_indexes, subset_dim = -1, + dim_range_node_name = None ): + """Convenience function to create an appended descriptor with the + splice_indexes. 
+ """ + + configs = [] + try: + zero_index = splice_indexes.index(0) + except ValueError: + zero_index = None + + if subset_dim > 0: + assert(dim_range_node_name is not None) + # if subset_dim is specified the script expects a zero + # in the splice indexes + assert(zero_index is not None) + line = ("dim-range-node name={0}" + " input-node={1}" + " dim-offset={2}" + " dim={3}" + "".format(dim_range_node_name, + input_desc, 0, subset_dim)) + configs.append(line) + subset_desc = dim_range_node_name + + else: + subset_desc = input_desc + subset_dim = input_dim + + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes)): + if j == zero_index: + appended_descriptors.append(input_desc) + appended_dimension += input_dim + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_desc, splice_indexes[j])) + appended_dimension += subset_dim + return ["Append({0})".format(", ".join(appended_descriptors)), + appended_dimension, + configs] + + diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py new file mode 100644 index 00000000000..87c9d880089 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -0,0 +1,615 @@ +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey). +# License: Apache 2.0. + +# This library contains various utilities that are involved in processing +# of xconfig -> config conversion. It contains "generic" lower-level code +# while xconfig_layers.py contains the code specific to layer types. + +from __future__ import print_function +import re +import sys + + +class XconfigParserError(RuntimeError): + def __init__(self, error_msg, conf_line=None): + self.conf_line = conf_line + if conf_line is not None: + self.msg = 'While parsing "{c}" :{e}'.format(c=conf_line, e=error_msg) + else: + self.msg = error_msg + + def __str__(self): + return self.msg + +# [utility function used in xconfig_layers.py] +# Given a list of objects of type XconfigLayerBase ('all_layers'), +# including at least the layers preceding 'current_layer' (and maybe +# more layers), return the names of layers preceding 'current_layer' +# This will be used in parsing expressions like [-1] in descriptors +# (which is an alias for the previous layer). +def get_prev_names(all_layers, current_layer): + prev_names = [] + for layer in all_layers: + if layer is current_layer: + break + prev_names.append(layer.get_name()) + prev_names_set = set() + for name in prev_names: + if name in prev_names_set: + raise XconfigParserError("{0}: Layer name {1} is used more than once.".format( + sys.argv[0], name), current_layer.str()) + prev_names_set.add(name) + return prev_names + + +# This is a convenience function to parser the auxiliary output name from the +# full layer name + +def split_layer_name(full_layer_name): + assert isinstance(full_layer_name, str) + split_name = full_layer_name.split('.') + if len(split_name) == 0: + raise XconfigParserError("Bad layer name: " + full_layer_name) + layer_name = split_name[0] + if len(split_name) == 1: + auxiliary_output = None + else: + # we probably expect len(split_name) == 2 in this case, + # but no harm in allowing dots in the auxiliary_output. + auxiliary_output = '.'.join(split_name[1:]) + + return [layer_name, auxiliary_output] + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a dimension. 
'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output), layer.str()) + return layer.output_dim(auxiliary_output) + # No such layer was found. + if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a descriptor (usually, but not required to be a simple +# component-node name) that can appear in the generated config file. 'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_string_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format( + layer_name, auxiliary_output)) + return layer.output_name(auxiliary_output) + # No such layer was found. + if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# This function, used in converting string values in config lines to +# configuration values in self.config in layers, attempts to +# convert 'string_value' to an instance dest_type (which is of type Type) +# 'key' is only needed for printing errors. 
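+# For example (illustrative calls only):
+#   convert_value_to_type('dim', type(int()), '1024')  returns the integer 1024
+#   convert_value_to_type('include-log-softmax', type(bool()), 'false')  returns False
+# and a value that cannot be converted to the requested type raises
+# XconfigParserError.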
+def convert_value_to_type(key, dest_type, string_value): + if dest_type == type(bool()): + if string_value == "True" or string_value == "true": + return True + elif string_value == "False" or string_value == "false": + return False + else: + raise XconfigParserError("Invalid configuration value {0}={1} (expected bool)".format( + key, string_value)) + elif dest_type == type(int()): + try: + return int(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(float()): + try: + return float(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(str()): + return string_value + + + +# This class parses and stores a Descriptor-- expression +# like Append(Offset(input, -3), input) and so on. +# For the full range of possible expressions, see the comment at the +# top of src/nnet3/nnet-descriptor.h. +# Note: as an extension to the descriptor format used in the C++ +# code, we can have e.g. input@-3 meaning Offset(input, -3); +# and if bare integer numbers appear where a descriptor was expected, +# they are interpreted as Offset(prev_layer, -3) where 'prev_layer' +# is the previous layer in the config file. + +# Also, in any place a raw input/layer/output name can appear, we accept things +# like [-1] meaning the previous input/layer/output's name, or [-2] meaning the +# last-but-one input/layer/output, and so on. +class Descriptor: + def __init__(self, + descriptor_string = None, + prev_names = None): + # self.operator is a string that may be 'Offset', 'Append', + # 'Sum', 'Failover', 'IfDefined', 'Offset', 'Switch', 'Round', + # 'ReplaceIndex'; it also may be None, representing the base-case + # (where it's just a layer name) + + # self.items will be whatever items are + # inside the parentheses, e.g. if this is Sum(foo bar), + # then items will be [d1, d2], where d1 is a Descriptor for + # 'foo' and d1 is a Descriptor for 'bar'. However, there are + # cases where elements of self.items are strings or integers, + # for instance in an expression 'ReplaceIndex(ivector, x, 0)', + # self.items would be [d, 'x', 0], where d is a Descriptor + # for 'ivector'. In the case where self.operator is None (where + # this Descriptor represents just a bare layer name), self. + # items contains the name of the input layer as a string. + self.operator = None + self.items = None + + if descriptor_string != None: + try: + tokens = tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (d, pos) = parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise XconfigParserError("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1])) + # copy members from d. + self.operator = d.operator + self.items = d.items + except XconfigParserError as e: + traceback.print_tb(sys.exc_info()[2]) + raise XconfigParserError("Error parsing Descriptor '{0}', specific error was: {1}".format( + descriptor_string, repr(e))) + + # This is like the str() function, but it uses the layer_to_string function + # (which is a function from strings to strings) to convert layer names (or + # in general sub-layer names of the form 'foo.bar') to the component-node + # (or, in general, descriptor) names that appear in the final config file. 
+ # This mechanism gives those designing layer types the freedom to name their + # nodes as they want. + def config_string(self, layer_to_string): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return layer_to_string(self.items[0]) + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join( + [ item.config_string(layer_to_string) if isinstance(item, Descriptor) else str(item) + for item in self.items]) + ')' + + def str(self): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return self.items[0] + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join([str(item) for item in self.items]) + ')' + + def __str__(self): + return self.str() + + # This function returns the dimension (i.e. the feature dimension) of the + # descriptor. It takes 'layer_to_dim' which is a function from + # layer-names (including sub-layer names, like lstm1.memory_cell) to + # dimensions, e.g. you might have layer_to_dim('ivector') = 100, or + # layer_to_dim('affine1') = 1024. + # note: layer_to_dim will raise an exception if a nonexistent layer or + # sub-layer is requested. + def dim(self, layer_to_dim): + if self.operator is None: + # base-case: self.items = [ layer_name ] (or sub-layer name, like + # 'lstm.memory_cell'). + return layer_to_dim(self.items[0]) + elif self.operator in [ 'Sum', 'Failover', 'IfDefined', 'Switch' ]: + # these are all operators for which all args are descriptors + # and must have the same dim. + dim = self.items[0].dim(layer_to_dim) + for desc in self.items[1:]: + next_dim = desc.dim(layer_to_dim) + if next_dim != dim: + raise XconfigParserError("In descriptor {0}, different fields have different " + "dimensions: {1} != {2}".format(self.str(), dim, next_dim)) + return dim + elif self.operator in [ 'Offset', 'Round', 'ReplaceIndex' ]: + # for these operators, only the 1st arg is relevant. + return self.items[0].dim(layer_to_dim) + elif self.operator == 'Append': + return sum([ x.dim(layer_to_dim) for x in self.items]) + else: + raise XconfigParserError("Unknown operator {0}".format(self.operator)) + + + +# This just checks that seen_item == expected_item, and raises an +# exception if not. +def expect_token(expected_item, seen_item, what_parsing): + if seen_item != expected_item: + raise XconfigParserError("parsing {0}, expected '{1}' but got '{2}'".format( + what_parsing, expected_item, seen_item)) + +# returns true if 'name' is valid as the name of a line (input, layer or output); +# this is the same as IsValidName() in the nnet3 code. +def is_valid_line_name(name): + return isinstance(name, str) and re.match(r'^[a-zA-Z_][-a-zA-Z_0-9.]*', name) != None + +# This function for parsing Descriptors takes an array of tokens as produced +# by tokenize_descriptor. It parses a descriptor +# starting from position pos >= 0 of the array 'tokens', and +# returns a new position in the array that reflects any tokens consumed while +# parsing the descriptor. +# It returns a pair (d, pos) where d is the newly parsed Descriptor, +# and 'pos' is the new position after consuming the relevant input. +# 'prev_names' is so that we can find the most recent layer name for +# expressions like Append(-3, 0, 3) which is shorthand for the most recent +# layer spliced at those time offsets.
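# Editor's note (illustration, not part of the patch; 'tdnn3' is a hypothetical layer name): with prev_names == ['tdnn3'], the shorthand forms described above are expected to expand as follows:
#   >>> Descriptor('Append(-1, 0, 1)', ['tdnn3']).str()
#   'Append(Offset(tdnn3, -1), tdnn3, Offset(tdnn3, 1))'
#   >>> Descriptor('input@-2', ['tdnn3']).str()
#   'Offset(input, -2)'
#   >>> Descriptor('[-1]', ['tdnn3']).str()
#   'tdnn3'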
+def parse_new_descriptor(tokens, pos, prev_names): + size = len(tokens) + first_token = tokens[pos] + pos += 1 + d = Descriptor() + + # when reading this function, be careful to note the indent level, + # there is an if-statement within an if-statement. + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # the 1st argument of all these operators is a Descriptor. + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items = [desc] + + if first_token == 'Offset': + expect_token(',', tokens[pos], 'Offset()') + pos += 1 + try: + t_offset = int(tokens[pos]) + pos += 1 + d.items.append(t_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + if tokens[pos] == ')': + return (d, pos + 1) + elif tokens[pos] != ',': + raise XconfigParserError("Parsing Offset(), expected ')' or ',', got " + tokens[pos]) + pos += 1 + try: + x_offset = int(tokens[pos]) + pos += 1 + d.items.append(x_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Offset()') + pos += 1 + elif first_token in [ 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + while True: + if tokens[pos] == ')': + # check num-items is correct for some special cases. + if first_token == 'Failover' and len(d.items) != 2: + raise XconfigParserError("Parsing Failover(), expected 2 items but got {0}".format(len(d.items))) + if first_token == 'IfDefined' and len(d.items) != 1: + raise XconfigParserError("Parsing IfDefined(), expected 1 item but got {0}".format(len(d.items))) + pos += 1 + break + elif tokens[pos] == ',': + pos += 1 # consume the comma. + else: + raise XconfigParserError("Parsing Append(), expected ')' or ',', got " + tokens[pos]) + + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + elif first_token == 'Round': + expect_token(',', tokens[pos], 'Round()') + pos += 1 + try: + t_modulus = int(tokens[pos]) + assert t_modulus > 0 + pos += 1 + d.items.append(t_modulus) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Round()') + pos += 1 + elif first_token == 'ReplaceIndex': + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + if tokens[pos] in [ 'x', 't' ]: + d.items.append(tokens[pos]) + pos += 1 + else: + raise XconfigParserError("Parsing ReplaceIndex(), expected 'x' or 't', got " + + tokens[pos]) + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + try: + new_value = int(tokens[pos]) + pos += 1 + d.items.append(new_value) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'ReplaceIndex()') + pos += 1 + else: + raise XconfigParserError("code error") + elif first_token in [ 'end of string', '(', ')', ',', '@' ]: + raise XconfigParserError("Expected descriptor, got " + first_token) + elif is_valid_line_name(first_token) or first_token == '[': + # This section parses a raw input/layer/output name, e.g. "affine2" + # (which must start with an alphabetic character or underscore), + # optionally followed by an offset like '@-3'. + + d.operator = None + d.items = [first_token] + + # If the layer-name o is followed by '@', then + # we're parsing something like 'affine1@-3' which + # is syntactic sugar for 'Offset(affine1, 3)'. 
+ if tokens[pos] == '@': + pos += 1 + try: + offset_t = int(tokens[pos]) + pos += 1 + except: + raise XconfigParserError("Parse error parsing {0}@{1}".format( + first_token, tokens[pos])) + if offset_t != 0: + inner_d = d + d = Descriptor() + # e.g. foo@3 is equivalent to 'Offset(foo, 3)'. + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + else: + # the last possible case is that 'first_token' is just an integer i, + # which can appear in things like Append(-3, 0, 3). + # See if the token is an integer. + # In this case, it's interpreted as the name of previous layer + # (with that time offset applied). + try: + offset_t = int(first_token) + except: + raise XconfigParserError("Parsing descriptor, expected descriptor but got " + + first_token) + assert isinstance(prev_names, list) + if len(prev_names) < 1: + raise XconfigParserError("Parsing descriptor, could not interpret '{0}' because " + "there is no previous layer".format(first_token)) + d.operator = None + # the layer name is the name of the most recent layer. + d.items = [prev_names[-1]] + if offset_t != 0: + inner_d = d + d = Descriptor() + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + return (d, pos) + + +# This function takes a string 'descriptor_string' which might +# look like 'Append([-1], [-2], input)', and a list of previous layer +# names like prev_names = ['foo', 'bar', 'baz'], and replaces +# the integers in brackets with the previous layers. -1 means +# the most recent previous layer ('baz' in this case), -2 +# means the last layer but one ('bar' in this case), and so on. +# It will throw an exception if the number is out of range. +# If there are no such expressions in the string, it's OK if +# prev_names == None (this is useful for testing). +def replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names = None): + fields = re.split(r'(\[|\])\s*', descriptor_string) + out_fields = [] + i = 0 + while i < len(fields): + f = fields[i] + i += 1 + if f == ']': + raise XconfigParserError("Unmatched ']' in descriptor") + elif f == '[': + if i + 2 >= len(fields): + raise XconfigParserError("Error tokenizing string '{0}': '[' found too close " + "to the end of the descriptor.".format(descriptor_string)) + assert isinstance(prev_names, list) + try: + offset = int(fields[i]) + assert offset < 0 and -offset <= len(prev_names) + i += 2 # consume the int and the ']'. + except: + raise XconfigParserError("Error tokenizing string '{0}': expression [{1}] has an " + "invalid or out of range offset.".format(descriptor_string, fields[i])) + this_field = prev_names[offset] + out_fields.append(this_field) + else: + out_fields.append(f) + return ''.join(out_fields) + +# tokenizes 'descriptor_string' into the tokens that may be part of Descriptors. +# Note: for convenience in parsing, we add the token 'end-of-string' to this +# list. +# The argument 'prev_names' (for the names of previous layers and input and +# output nodes) is needed to process expressions like [-1] meaning the most +# recent layer, or [-2] meaning the last layer but one. +# The default None for prev_names is only supplied for testing purposes. +def tokenize_descriptor(descriptor_string, + prev_names = None): + # split on '(', ')', ',', '@', and space. Note: the parenthesis () in the + # regexp causes it to output the stuff inside the () as if it were a field, + # which is how the call to re.split() keeps characters like '(' and ')' as + # tokens. 
+ fields = re.split(r'(\(|\)|@|,|\s)\s*', + replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names)) + ans = [] + for f in fields: + # don't include fields that are space, or are empty. + if re.match(r'^\s*$', f) is None: + ans.append(f) + + ans.append('end of string') + return ans + + +# This function parses a line in a config file, something like +# affine-layer name=affine1 input=Append(-3, 0, 3) +# and returns a pair, +# (first_token, fields), as (string, dict) e.g. in this case +# ('affine-layer', {'name':'affine1', 'input':'Append(-3, 0, 3)" +# Note: spaces are allowed in the field names but = signs are +# disallowed, which is why it's possible to parse them. +# This function also removes comments (anything after '#'). +# As a special case, this function will return None if the line +# is empty after removing spaces. +def parse_config_line(orig_config_line): + # Remove comments. + # note: splitting on '#' will always give at least one field... python + # treats splitting on space as a special case that may give zero fields. + config_line = orig_config_line.split('#')[0] + if re.match('[^a-zA-Z0-9\.\-\(\)_\s"]', config_line) is not None: + raise XconfigParserError("Xconfig line has unknown characters.", config_line) + + # Now split on space; later we may splice things back together. + fields=config_line.split() + if len(fields) == 0: + return None # Line was only whitespace after removing comments. + first_token = fields[0] + # if first_token does not look like 'foo-bar' or 'foo-bar2', then die. + if re.match('^[a-z][-a-z0-9]+$', first_token) is None: + raise XconfigParserError("Error parsing config line (first field doesn't look right): {0}".format( + orig_config_line)) + # get rid of the first field which we put in 'first_token'. 
+ fields = fields[1:] + + rest_of_line = ' '.join(fields) + # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' + positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + if not len(positions) % 2 == 0: + raise XconfigParserError('"s should occur in pairs', config_line) + + # add the " enclosed strings and corresponding keys to the dict + # and remove them from the rest_of_line + num_strings = len(positions) / 2 + fields = [] + for i in range(num_strings): + start = positions[i * 2] + end = positions[i * 2 + 1] + rest_of_line_after = rest_of_line[end + 1:] + parts = rest_of_line[:start].split() + rest_of_line_before = ' '.join(parts[:-1]) + assert(parts[-1][-1] == '=') + fields.append(parts[-1][:-1]) + fields.append(rest_of_line[start + 1 : end]) + rest_of_line = rest_of_line_before + ' ' + rest_of_line_after + + # suppose rest_of_line is: 'input=Append(foo, bar) foo=bar' + # then after the below we'll get + # fields = ['', 'input', 'Append(foo, bar)', 'foo', 'bar'] + ans_dict = dict() + other_fields = re.split(r'\s*([-a-zA-Z0-9_]*)=', rest_of_line) + if not (other_fields[0] == '' and len(other_fields) % 2 == 1): + raise XconfigParserError("Could not parse config line: " + orig_config_line) + fields += other_fields[1:] + num_variables = len(fields) / 2 + for i in range(num_variables): + var_name = fields[i * 2] + var_value = fields[i * 2 + 1] + if re.match(r'[a-zA-Z_]', var_name) is None: + raise XconfigParserError("Expected variable name '{0}' to start with alphabetic character or _, " + "in config line {1}".format(var_name, orig_config_line)) + if var_name in ans_dict: + raise XconfigParserError("Config line has multiply defined variable {0}: {1}".format( + var_name, orig_config_line)) + ans_dict[var_name] = var_value + return (first_token, ans_dict) + +# Reads a config file and returns a list of objects, where each object +# represents one line of the file. 
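# Editor's note (illustration, not part of the patch): each line read by the function below is handed to parse_config_line() above, which for example turns
#   'affine-layer name=affine1 input=Append(-3, 0, 3)'
# into the pair ('affine-layer', {'name': 'affine1', 'input': 'Append(-3, 0, 3)'}).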
+def read_config_file(filename): + try: + f = open(filename, "r") + except IOError as e: + raise XconfigParserError("Error reading config file {0}: {1}".format( + filename, repr(e))) + ans = [] + prev_names = [] + while True: + line = f.readline() + if line == '': + break + x = parse_config_line(line) + if x is None: + continue # blank line + (first_token, key_to_value) = x + layer_object = config_line_to_object(first_token, key_to_value, prev_names) + ans.append(layer_object) + prev_names.append(layer_object.get_name()) + return ans + +def test_library(): + tokenize_test = lambda x: tokenize_descriptor(x)[:-1] # remove 'end of string' + assert tokenize_test("hi") == ['hi'] + assert tokenize_test("hi there") == ['hi', 'there'] + assert tokenize_test("hi,there") == ['hi', ',', 'there'] + assert tokenize_test("hi@-1,there") == ['hi', '@', '-1', ',', 'there'] + assert tokenize_test("hi(there)") == ['hi', '(', 'there', ')'] + assert tokenize_descriptor("[-1]@2", ['foo', 'bar'])[:-1] == ['bar', '@', '2' ] + assert tokenize_descriptor("[-2].special@2", ['foo', 'bar'])[:-1] == ['foo.special', '@', '2' ] + + assert Descriptor('foo').str() == 'foo' + assert Descriptor('Sum(foo,bar)').str() == 'Sum(foo, bar)' + assert Descriptor('Sum(Offset(foo,1),Offset(foo,0))').str() == 'Sum(Offset(foo, 1), Offset(foo, 0))' + for x in [ 'Append(foo, Sum(bar, Offset(baz, 1)))', 'Failover(foo, Offset(bar, -1))', + 'IfDefined(Round(baz, 3))', 'Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))', + 'IfDefined(ReplaceIndex(ivector, t, 0))', 'ReplaceIndex(foo, x, 0)' ]: + if not Descriptor(x).str() == x: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), x)) + + prev_names = ['last_but_one_layer', 'prev_layer'] + for x, y in [ ('Sum(foo,bar)', 'Sum(foo, bar)'), + ('Sum(foo1,bar-3_4)', 'Sum(foo1, bar-3_4)'), + ('Append(input@-3, input@0, input@3)', + 'Append(Offset(input, -3), input, Offset(input, 3))'), + ('Append(-3,0,3)', + 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), + ('[-1]', 'prev_layer'), + ('[-2]', 'last_but_one_layer'), + ('[-2]@3', + 'Offset(last_but_one_layer, 3)') ]: + if not Descriptor(x, prev_names).str() == y: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), y)) + + + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar opt2="a=1 b=2"')) + print(parse_config_line('affine-layer1 input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer')) + +if __name__ == "__main__": + test_library() diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py index fdd7a02fd88..b27cd9eff1c 100755 --- a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py @@ -2,6 +2,9 @@ # Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# This script was modified around 11.11.2016, when the code was extended to +# support having a different pdf-class on the self loop. + # Generate a topology file. This allows control of the number of states in the # non-silence HMMs, and in the silence HMMs. This is a modified version of # 'utils/gen_topo.pl' that generates a different type of topology, one that we # believe should be useful in the 'chain' model. Note: right now it doesn't # have any real options, and it treats silence and nonsilence the same. The # intention is that you write different versions of this script, or add options, # if you experiment with it. @@ -41,9 +44,8 @@ # We make the transition-probs 0.5 so they normalize, to keep the code happy. # In fact, we always set the transition probability scale to 0.0 in the 'chain' # code, so they are never used.
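# Editor's note (not part of the patch): in the new entry printed below, the single emitting state uses pdf-class 0 on its forward transition and pdf-class 1 on its self-loop. This is intended to be equivalent to the old three-state entry (kept in gen_topo_orig.py further below for baseline and testing): one frame per phone drawn from one pdf, any additional frames from another, with a minimum duration of one frame.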
-print("<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>") -print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>") -print("<State> 2 </State>") +print("<State> 0 <ForwardPdfClass> 0 <SelfLoopPdfClass> 1 <Transition> 0 0.5 <Transition> 1 0.5 </State>") +print("<State> 1 </State>") print("</TopologyEntry>") print("</Topology>") diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo_orig.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo_orig.py new file mode 100755 index 00000000000..01a715a9a23 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo_orig.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# This file is as ./gen_topo.py used to be (before we extended the transition-model +# code to support having a different self-loop pdf-class). It is included +# here for baseline and testing purposes. + + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + "<colon-separated-nonsilence-phones> <colon-separated-silence-phones>" + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("<Topology>") +print("<TopologyEntry>") +print("<ForPhones>") +print(" ".join([str(x) for x in all_phones])) +print("</ForPhones>") +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. +# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. +print("<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>") +print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>") +print("<State> 2 </State>") +print("</TopologyEntry>") +print("</Topology>") diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py index f012d06cca9..d58db33bf98 100644 --- a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -169,7 +169,8 @@ def PrepareInitialAcousticModel(dir, run_opts): command = run_opts.command, dir = dir)) def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, left_context, right_context, + leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts): # Now do combination.
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where @@ -188,10 +189,13 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ + --verbose=3 {dir}/den.fst {raw_models} \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/combine.cegs ark:- | \ + nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, dir = dir, raw_models = " ".join(raw_model_strings), num_chunk_per_minibatch = num_chunk_per_minibatch, @@ -201,9 +205,20 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'final', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts, + wait = False) -def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, +def ComputeTrainCvProbabilities(dir, iter, + egs_dir, left_context, right_context, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) @@ -213,9 +228,10 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/valid_diagnostic.cegs ark:-| nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) @@ -225,11 +241,12 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/train_diagnostic.cegs ark:- | nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) diff 
--git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cd9ebf4c7a3..15679fb4061 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -118,11 +118,7 @@ def GetArgs(): " chain model's output") parser.add_argument("--chain.left-deriv-truncate", type=int, dest='left_deriv_truncate', - default = None, help="") - parser.add_argument("--chain.right-deriv-truncate", type=int, - dest='right_deriv_truncate', - default = None, help="") - + default = None, help="Deprecated. Kept for back compatibility") # trainer options parser.add_argument("--trainer.srand", type=int, dest='srand', @@ -224,6 +220,14 @@ def GetArgs(): parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=512, help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', + default = None, + help="(Relevant only for recurrent models). If specified, gives the margin " + "(in input frames) around the 'required' part of each chunk that the " + "derivatives are backpropagated to. If unset, the derivatives are " + "backpropagated all the way to the boundaries of the input data. E.g. 8 is " + "a reasonable setting. Note: the 'required' part of the chunk is defined by " + "the model's {left,right}-context.") # General options parser.add_argument("--stage", type=int, default=-4, @@ -258,7 +262,8 @@ def GetArgs(): parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--tree-dir", type=str, required = True, - help="Languade directory") + help="Directory containing the tree to use for this model (we also " + "expect final.mdl and ali.*.gz in that directory") parser.add_argument("--lat-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -284,6 +289,12 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if not args.left_deriv_truncate is None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning("--chain.left-deriv-truncate (deprecated) is set by user, " + "and --trainer.deriv-truncate-margin is set to negative of that value={0}. 
" + "We recommend using the option --trainer.deriv-truncate-margin.".format(args.deriv_truncate_margin)) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") @@ -325,9 +336,9 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, + raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, + min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, @@ -340,10 +351,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi # but we use the same script for consistency with FF-DNN code deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + if not min_deriv_time is None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time) + if not max_deriv_time is None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time) processes = [] for job in range(1,num_jobs+1): @@ -366,7 +377,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, train_queue_opt = run_opts.train_queue_opt, @@ -379,11 +390,12 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, momentum = momentum, max_param_change = max_param_change, raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, + egs_dir = egs_dir, lc=left_context, rc=right_context, + archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, cache_io_opts = cur_cache_io_opts, num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) + wait = False) processes.append(process_handle) @@ -404,7 +416,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + left_context, right_context, + apply_deriv_weights, min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, 
truncate_deriv_weights, @@ -427,8 +440,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + chain_lib.ComputeTrainCvProbabilities(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts) if iter > 0: chain_lib.ComputeProgress(dir, iter, run_opts) @@ -460,15 +480,30 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 cur_max_param_change = float(max_param_change) / math.sqrt(2) - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) + TrainNewModels(dir = dir, + iter = iter, + srand = srand, + num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + momentum = momentum, + max_param_change = cur_max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + frame_subsampling_factor = frame_subsampling_factor, + truncate_deriv_weights = truncate_deriv_weights, + cache_io_opts = cache_io_opts, + run_opts = run_opts) [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] @@ -567,14 +602,15 @@ def Train(args, run_opts): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + egs_left_context = left_context + args.frame_subsampling_factor/2 + egs_right_context = right_context + args.frame_subsampling_factor/2 default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") # this is where get_egs.sh is called. 
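# Editor's note (illustration with hypothetical values, not part of the patch): with model_left_context = 28, model_right_context = 14, zero chunk context and frame_subsampling_factor = 3, the egs would be dumped with egs_left_context = 28 + 3/2 = 29 and egs_right_context = 14 + 3/2 = 15 (integer division); and with --trainer.deriv-truncate-margin 8 and chunk_width = 150, the derivative limits computed further below would be min_deriv_time = -8 - 28 = -36 and max_deriv_time = 150 - 1 + 8 + 14 = 171.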
chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, - left_context + args.frame_subsampling_factor/2, - right_context + args.frame_subsampling_factor/2, + egs_left_context, egs_right_context, run_opts, left_tolerance = args.left_tolerance, right_tolerance = args.right_tolerance, @@ -594,7 +630,7 @@ def Train(args, run_opts): else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, egs_left_context, egs_right_context) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -638,6 +674,12 @@ def Train(args, run_opts): args.initial_effective_lrate, args.final_effective_lrate) + min_deriv_time = None + max_deriv_time = None + if not args.deriv_truncate_margin is None: + min_deriv_time = -args.deriv_truncate_margin - model_left_context + max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin + model_right_context + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): @@ -653,18 +695,32 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value, - args.num_chunk_per_minibatch, - num_hidden_layers, args.add_layers_period, - args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, - args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, - args.frame_subsampling_factor, - args.truncate_deriv_weights, run_opts) + TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = args.apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + frame_subsampling_factor = args.frame_subsampling_factor, + truncate_deriv_weights = args.truncate_deriv_weights, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -683,10 +739,17 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - 
chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts) + chain_lib.CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + run_opts = run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index cf755a8d2ec..4bfcb219fc3 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -96,7 +96,7 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) @@ -111,7 +111,7 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) @@ -290,12 +290,12 @@ def AddLstmLayer(config_lines, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, clipping_threshold = 1.0, - norm_based_clipping = "false", + zeroing_threshold = 3.0, + zeroing_interval = 20, ng_per_element_scale_options = "", ng_affine_options = "", lstm_delay = -1, self_repair_scale_nonlinearity = None, - self_repair_scale_clipgradient = None, max_change_per_component = 0.75): assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) components = config_lines['components'] @@ -320,8 +320,6 @@ def AddLstmLayer(config_lines, # self_repair_scale_nonlinearity is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent, # i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent self_repair_nonlinearity_string = "self-repair-scale={0:.10f}".format(self_repair_scale_nonlinearity) if self_repair_scale_nonlinearity is not None else '' - # self_repair_scale_clipgradient is a constant scaling the self-repair vector computed in ClipGradientComponent - self_repair_clipgradient_string = "self-repair-scale={0:.2f}".format(self_repair_scale_clipgradient) if self_repair_scale_clipgradient is not None else '' # 
Natural gradient per element scale parameters ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " # Per-component max-change option @@ -357,7 +355,10 @@ def AddLstmLayer(config_lines, components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, cell_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_c type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, cell_dim, clipping_threshold, zeroing_threshold, + zeroing_interval, abs(lstm_delay))) # c1_t and c2_t defined below component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name)) @@ -396,7 +397,10 @@ def AddLstmLayer(config_lines, if (add_recurrent_projection and add_non_recurrent_projection): components.append("# projection matrices : Wrm and Wpm") components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options, max_change_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_r type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, recurrent_projection_dim, clipping_threshold, + zeroing_threshold, zeroing_interval, abs(lstm_delay))) component_nodes.append("# r_t and p_t") component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) @@ -406,8 +410,12 @@ def AddLstmLayer(config_lines, elif add_recurrent_projection: components.append("# projection matrices : Wrm") - components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options, max_change_options)) - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format( + name, cell_dim, recurrent_projection_dim, ng_affine_options, max_change_options)) + components.append("component name={0}_r type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, recurrent_projection_dim, clipping_threshold, + 
zeroing_threshold, zeroing_interval, abs(lstm_delay))) component_nodes.append("# r_t") component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) @@ -415,7 +423,10 @@ def AddLstmLayer(config_lines, output_dim = recurrent_projection_dim else: - components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} {4}".format(name, cell_dim, clipping_threshold, norm_based_clipping, self_repair_clipgradient_string)) + components.append("component name={0}_r type=BackpropTruncationComponent dim={1} " + "clipping-threshold={2} zeroing-threshold={3} zeroing-interval={4} " + "recurrence-interval={5}".format(name, cell_dim, clipping_threshold, + zeroing_threshold, zeroing_interval, abs(lstm_delay))) component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name)) output_descriptor = '{0}_r_t'.format(name) output_dim = cell_dim @@ -430,29 +441,41 @@ def AddBLstmLayer(config_lines, recurrent_projection_dim = 0, non_recurrent_projection_dim = 0, clipping_threshold = 1.0, - norm_based_clipping = "false", + zeroing_threshold = 3.0, + zeroing_interval = 20, ng_per_element_scale_options = "", ng_affine_options = "", lstm_delay = [-1,1], self_repair_scale_nonlinearity = None, - self_repair_scale_clipgradient = None, max_change_per_component = 0.75): assert(len(lstm_delay) == 2 and lstm_delay[0] < 0 and lstm_delay[1] > 0) - output_forward = AddLstmLayer(config_lines, "{0}_forward".format(name), input, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + output_forward = AddLstmLayer(config_lines = config_lines, + name = "{0}_forward".format(name), + input = input, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient, max_change_per_component = max_change_per_component) - output_backward = AddLstmLayer(config_lines, "{0}_backward".format(name), input, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + output_backward = AddLstmLayer(config_lines = config_lines, + name = "{0}_backward".format(name), + input = input, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[1], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, - self_repair_scale_clipgradient = self_repair_scale_clipgradient, max_change_per_component = max_change_per_component) output_descriptor = 'Append({0}, {1})'.format(output_forward['descriptor'], output_backward['descriptor']) 
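# Editor's note (illustration, not part of the patch; 'Lstm1' and dim=1024 are hypothetical): with the defaults above (clipping_threshold=1.0, zeroing_threshold=3.0, zeroing_interval=20, lstm_delay=-1), the replacement for the old ClipGradientComponent line would be rendered as
#   component name=Lstm1_c type=BackpropTruncationComponent dim=1024 clipping-threshold=1.0 zeroing-threshold=3.0 zeroing-interval=20 recurrence-interval=1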
output_dim = output_forward['dimension'] + output_backward['dimension'] @@ -461,4 +484,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 2290c4d2e7f..2a6499090e2 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -90,11 +90,12 @@ def GetDotNodeName(name_string, is_component = False): # this function is required as dot does not allow all the component names # allowed by nnet3. # Identified incompatibilities : - # 1. dot does not allow hyphen(-) in names + # 1. dot does not allow hyphen(-) and dot(.) in names # 2. Nnet3 names can be shared among components and component nodes # dot does not allow common names # node_name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("\.", "_dot_", node_name_string) if is_component: node_name_string += node_name_string.strip() + "_component" return {"label":name_string, "node":node_name_string} diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 8e6e3d8e0e2..01f84484a92 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -77,13 +77,21 @@ def GetArgs(): # Gradient clipper options parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, - help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + help="Outdated option retained for back compatibility, has no effect.", + default=True, choices = ["false", "true"]) parser.add_argument("--clipping-threshold", type=float, - help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + help="clipping threshold used in BackpropTruncation components, " + "if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--zeroing-threshold", type=float, + help="zeroing threshold used in BackpropTruncation components, " + "if zeroing-threshold=0 no periodic zeroing is done", default=3.0) + parser.add_argument("--zeroing-interval", type=int, + help="zeroing interval used in BackpropTruncation components", default=20) parser.add_argument("--self-repair-scale-nonlinearity", type=float, help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=0.00001) parser.add_argument("--self-repair-scale-clipgradient", type=float, - help="A non-zero value activates the self-repair mechanism in the ClipGradient component of the LSTM", default=1.0) + help="Outdated option retained for back compatibility, has no effect.", + default=1.0) # Delay options parser.add_argument("--label-delay", type=int, default=None, @@ -133,8 +141,10 @@ def CheckArgs(args): if (args.num_lstm_layers < 1): sys.exit("--num-lstm-layers has to be a positive integer") - if (args.clipping_threshold < 0): - sys.exit("--clipping-threshold has to be a non-negative") + if (args.clipping_threshold < 0 or args.zeroing_threshold < 0): + sys.exit("--clipping-threshold and --zeroing-threshold have to be non-negative") + if not args.zeroing_interval > 0: + raise Exception("--zeroing-interval has to be positive") if args.lstm_delay is None: args.lstm_delay = [[-1]] * args.num_lstm_layers else: @@ -221,7 +231,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, splice_indexes, lstm_delay, cell_dim, 
hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, - norm_based_clipping, clipping_threshold, + norm_based_clipping, clipping_threshold, zeroing_threshold, zeroing_interval, ng_per_element_scale_options, ng_affine_options, label_delay, include_log_softmax, xent_regularize, self_repair_scale_nonlinearity, self_repair_scale_clipgradient, @@ -243,22 +253,34 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer - prev_layer_output = nodes.AddBLstmLayer(config_lines, "BLstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + prev_layer_output = nodes.AddBLstmLayer(config_lines = config_lines, + name = "BLstm{0}".format(i+1), + input = prev_layer_output, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[i], - self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient, + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, max_change_per_component = max_change_per_component) else: # add a uni-directional LSTM layer - prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), - prev_layer_output, cell_dim, - recurrent_projection_dim, non_recurrent_projection_dim, - clipping_threshold, norm_based_clipping, - ng_per_element_scale_options, ng_affine_options, + prev_layer_output = nodes.AddLstmLayer(config_lines = config_lines, + name = "Lstm{0}".format(i+1), + input = prev_layer_output, + cell_dim = cell_dim, + recurrent_projection_dim = recurrent_projection_dim, + non_recurrent_projection_dim = non_recurrent_projection_dim, + clipping_threshold = clipping_threshold, + zeroing_threshold = zeroing_threshold, + zeroing_interval = zeroing_interval, + ng_per_element_scale_options = ng_per_element_scale_options, + ng_affine_options = ng_affine_options, lstm_delay = lstm_delay[i][0], - self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient, + self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, max_change_per_component = max_change_per_component) # make the intermediate config file for layerwise discriminative # training @@ -336,6 +358,8 @@ def Main(): num_hidden_layers = num_hidden_layers, norm_based_clipping = args.norm_based_clipping, clipping_threshold = args.clipping_threshold, + zeroing_threshold = args.zeroing_threshold, + zeroing_interval = args.zeroing_interval, ng_per_element_scale_options = args.ng_per_element_scale_options, ng_affine_options = args.ng_affine_options, label_delay = args.label_delay, diff --git a/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py new file mode 100644 index 00000000000..e6dc907fe0a --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py @@ -0,0 +1 @@ +# This module will house the latest training libraries being written by Vimal diff --git 
a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index c36de8c16bf..06ccf9657be 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -17,6 +17,7 @@ if [ $# != 3 ]; then echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" echo "" echo "Main options (for others, see top of script file)" + echo " --info-bin # Name of the binary to generate the nnet3 file" echo " --component-attributes # attributes to be printed in nnet3 components" echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" @@ -34,6 +35,7 @@ $info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ $attr $dot_file +echo "Generated the dot file $dot_file" command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..e92ab05a847 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -252,7 +252,10 @@ def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') if (egs_left_context < left_context) or (egs_right_context < right_context): - raise Exception('The egs have insufficient context') + raise Exception('The egs have insufficient context.' + ' Required left context is {rlc} and available left context is {alc}.' + ' Required right context is {rrc} and available right context is {arc}.'.format(rlc = left_context, alc = egs_left_context, + rrc = right_context, arc = egs_right_context)) frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) @@ -506,52 +509,65 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False): +def ComputeTrainCvProbabilities(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/valid_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = 
context_opts, egs_dir = egs_dir), wait = wait) -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False): +def ComputeProgress(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait=False): prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) + + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ -"ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" +"ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:-|" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, mb_size = mb_size, prev_model = prev_model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, left_context, right_context, chunk_width = None): # Now do combination. In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -570,26 +586,39 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + --verbose=3 {raw_models} "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/combine.egs ark:- | \ + nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, num_iters = num_iters, + context_opts = context_opts, egs_dir = egs_dir)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'combined', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + wait = False) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, left_context, right_context, + run_opts): # Note: this just uses CPUs, using a smallish subset of data. 
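The refactored diagnostic and combination helpers above now take left_context and right_context and prepend an nnet3-copy-egs stage to each egs pipeline, so the examples are trimmed to the model's context before nnet3-merge-egs batches them. A minimal Python sketch of how that piped rspecifier is assembled (option names follow the diff; the context values and egs path are illustrative only):

# Sketch: build the context options and the piped rspecifier used for the
# diagnostic computations (the values below are examples, not defaults).
left_context, right_context = 40, 40
egs_dir, mb_size = "exp/nnet3/tdnn/egs", 256

context_opts = "--left-context={0} --right-context={1}".format(
    left_context, right_context)

rspecifier = ("ark,bg:nnet3-copy-egs {context_opts} "
              "ark:{egs_dir}/valid_diagnostic.egs ark:- | "
              "nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |"
              .format(context_opts=context_opts, egs_dir=egs_dir,
                      mb_size=mb_size))
print(rspecifier)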
""" Computes the average posterior of the network""" import glob @@ -601,19 +630,24 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:- ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ -matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, dir = dir, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, egs_dir = egs_dir, egs_part = egs_part, + context_opts = context_opts, prior_gpu_opt = run_opts.prior_gpu_opt)) # make sure there is time for $dir/post.{iter}.*.vec to appear. diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..26ca16c364b 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -102,13 +102,23 @@ def Compile(self): lat_file.close() logger.info("Compiling the latex report.") try: - proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(['pdflatex', '-interaction=batchmode', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() except Exception as e: logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) return False return True +def LatexCompliantName(name_string): + # this function is required as latex does not allow all the component names + # allowed by nnet3. + # Identified incompatibilities : + # 1. latex does not allow dot(.) in file names + # + node_name_string = re.sub("\.", "_dot_", name_string) + + return node_name_string + def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): assert(start_iter >= 1) @@ -240,7 +250,8 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) 
plt.grid(True) fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) @@ -317,7 +328,8 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Clipped-proportion value at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Clipped proportion at {0}".format(component_name)) @@ -417,7 +429,8 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Parameter differences at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/param_diff_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index e4a9e617e48..4139d446872 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -359,10 +359,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + ComputeTrainCvProbabilities(dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts) + ComputeProgress(dir=dir, iter=iter, egs_dir=egs_dir, + left_context=left_context, right_context=right_context, + run_opts=run_opts) if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): @@ -578,14 +582,24 @@ def Train(args, run_opts): logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - args.minibatch_size, args.frames_per_eg, - num_hidden_layers, args.add_layers_period, - left_context, right_context, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, run_opts) + 
TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -604,12 +618,24 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + avg_post_vec_file = ComputeAveragePosterior(dir = args.dir, + iter = 'combined', + egs_dir = egs_dir, + num_archives = num_archives, + prior_subset_size = args.prior_subset_size, + left_context = left_context, + right_context = right_context, + run_opts = run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir = args.dir) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 7ac7a58a3d5..89db4276cfc 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -194,7 +194,7 @@ def GetArgs(): help="Number of sequences to be processed in parallel every minibatch" ) parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', default=None, - help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to (chunk-width + 10)." 
) # General options parser.add_argument("--stage", type=int, default=-4, @@ -346,7 +346,7 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, + left_context, right_context, min_deriv_time, max_deriv_time, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, cache_read_opt, run_opts): @@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + --optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \ "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, @@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, + min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time, raw_model = raw_model_string, context_opts = context_opts, egs_dir = egs_dir, archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, @@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, + left_context, right_context, min_deriv_time, max_deriv_time, momentum, max_param_change, shuffle_buffer_size, cv_minibatch_size, run_opts): # Set off jobs doing some diagnostics, in the background. @@ -430,10 +430,22 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.close() - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) + ComputeTrainCvProbabilities(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + mb_size=cv_minibatch_size) if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) + ComputeProgress(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + mb_size=cv_minibatch_size) # an option for writing cache (storing pairs of nnet-computations # and computation-requests) during training. 
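With the changes above, backprop truncation in train_rnn.py is bounded on both ends: the trainer now passes --optimization.max-deriv-time in addition to --optimization.min-deriv-time, both derived from num_bptt_steps. A small sketch of the arithmetic, assuming the default num_bptt_steps and illustrative chunk/context values:

# Deriv-time window used by train_rnn.py (chunk/context values are examples).
chunk_width = 20
chunk_left_context = 40
chunk_right_context = 40
num_bptt_steps = None  # None means "use the default"

if num_bptt_steps is None:
    # default: chunk_width plus up to 10 extra frames, limited by the contexts
    num_bptt_steps = chunk_width + min(10, chunk_left_context,
                                       chunk_right_context)

min_deriv_time = chunk_width - num_bptt_steps  # 20 - 30 = -10
max_deriv_time = num_bptt_steps - 1            # 29
print(min_deriv_time, max_deriv_time)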
@@ -467,12 +479,24 @@ def TrainOneIteration(dir, iter, srand, egs_dir, except OSError: pass - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) + TrainNewModels(dir = dir, + iter = iter, + srand = srand, + num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + momentum = momentum, + max_param_change = max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + cache_read_opt = cache_read_opt, + run_opts = run_opts) [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] for n in models_to_average: @@ -627,11 +651,13 @@ def Train(args, run_opts): cur_egs_dir=egs_dir if args.num_bptt_steps is None: - num_bptt_steps = args.chunk_width + # num_bptt_steps is set to (chunk_width + 10) by default + num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context) else: num_bptt_steps = args.num_bptt_steps min_deriv_time = args.chunk_width - num_bptt_steps + max_deriv_time = num_bptt_steps - 1 logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) @@ -672,6 +698,7 @@ def Train(args, run_opts): left_context = left_context, right_context = right_context, min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, momentum = args.momentum, max_param_change= args.max_param_change, shuffle_buffer_size = args.shuffle_buffer_size, @@ -696,13 +723,25 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts, - chunk_width = args.chunk_width) + CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + chunk_width = args.chunk_width) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of adjusting the priors.") - avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, - num_archives, args.prior_subset_size, run_opts) + avg_post_vec_file = ComputeAveragePosterior(dir = args.dir, + iter = 'combined', + egs_dir = egs_dir, + num_archives = num_archives, + prior_subset_size = args.prior_subset_size, + left_context = left_context, + right_context = right_context, + run_opts = run_opts) logger.info("Re-adjusting priors based on computed posteriors") combined_model = "{dir}/combined.mdl".format(dir = args.dir) diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py new file mode 100755 index 00000000000..e29a9404403 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast +from collections import defaultdict + 
+sys.path.insert(0, 'steps/') +# the following is in case we weren't running this from the normal directory. +sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') + +import libs.nnet3.xconfig.parser as xparser +# do the proper import when python scripts have been refactored +nnet3_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py') + +def get_args(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description='Reads an xconfig file and creates config files ' + 'for neural net creation and training', + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser.add_argument('--xconfig-file', required=True, + help='Filename of input xconfig file') + parser.add_argument('--config-dir', required=True, + help='Directory to write config files and variables') + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = check_args(args) + + return args + +def check_args(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + return args + + + + +def backup_xconfig_file(xconfig_file, config_dir): + # we write a copy of the xconfig file just to have a record of the original + # input. + try: + xconfig_file_out = open(config_dir + '/xconfig', 'w') + except: + sys.exit('{0}: error opening file {1}/xconfig for output'.format( + sys.argv[0], config_dir)) + try: + xconfig_file_in = open(xconfig_file) + except: + sys.exit('{0}: error opening file {1} for input'.format(sys.argv[0], config_dir)) + + print("# This file was created by the command:\n" + "# {0}\n" + "# It is a copy of the source from which the config files in " + "# this directory were generated.\n".format(' '.join(sys.argv)), + file=xconfig_file_out) + + while True: + line = xconfig_file_in.readline() + if line == '': + break + print(line.strip(), file=xconfig_file_out) + xconfig_file_out.close() + xconfig_file_in.close() + + +# This functions writes config_dir/xconfig.expanded.1 and +# config_dir/xconfig.expanded.2, showing some of the internal stages of +# processing the xconfig file before turning it into config files. +def write_expanded_xconfig_files(config_dir, all_layers): + try: + xconfig_file_out = open(config_dir + '/xconfig.expanded.1', 'w') + except: + sys.exit('{0}: error opening file {1}/xconfig.expanded.1 for output'.format( + sys.argv[0], config_dir)) + + + print('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '#It contains the same content as ./xconfig but it was parsed and\n' + '#default config values were set.\n' + '# See also ./xconfig.expanded.2\n', file=xconfig_file_out) + + for layer in all_layers: + print(str(layer), file=xconfig_file_out) + xconfig_file_out.close() + + try: + xconfig_file_out = open(config_dir + '/xconfig.expanded.2', 'w') + except: + sys.exit('{0}: error opening file {1}/xconfig.expanded.2 for output'.format( + sys.argv[0], config_dir)) + + print('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the same content as ./xconfig but it was parsed,\n' + '# default config values were set, and Descriptors (input=xxx) were normalized.\n' + '# See also ./xconfig.expanded.1\n\n', + file=xconfig_file_out) + + for layer in all_layers: + layer.normalize_descriptors() + print(str(layer), file=xconfig_file_out) + xconfig_file_out.close() + +# This function returns a map from config-file basename +# e.g. 'init', 'ref', 'layer1' to a documentation string that goes +# at the top of the file. 
+def get_config_headers(): + ans = defaultdict(str) # resulting dict will default to the empty string + # for any config files not explicitly listed here. + ans['init'] = ('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the input of the network and is used in\n' + '# accumulating stats for an LDA-like transform of the\n' + '# input features.\n'); + ans['ref'] = ('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the entire neural network, but with those\n' + '# components that would normally require fixed vectors/matrices\n' + '# read from disk, replaced with random initialization\n' + '# (this applies to the LDA-like transform and the\n' + '# presoftmax-prior-scale, if applicable). This file\n' + '# is used only to work out the left-context and right-context\n' + '# of the network.\n'); + ans['final'] = ('# This file was created by the command:\n' + '# ' + ' '.join(sys.argv) + '\n' + '# It contains the entire neural network.\n') + + return ans; + + + + +# This is where most of the work of this program happens. +def write_config_files(config_dir, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + config_basename_to_header = get_config_headers() + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], str(layer), + repr(e)), file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + for basename,lines in config_basename_to_lines.items(): + header = config_basename_to_header[basename] + filename = '{0}/{1}.config'.format(config_dir, basename) + try: + f = open(filename, 'w') + print(header, file=f) + for line in lines: + print(line, file=f) + f.close() + except Exception as e: + print('{0}: error writing to config file {1}: error is {2}'.format( + sys.argv[0], filename, repr(e)), file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + +def add_back_compatibility_info(config_dir): + """This will be removed when python script refactoring is done.""" + + nnet3_lib.RunKaldiCommand("nnet3-init {0}/ref.config {0}/ref.raw".format(config_dir)) + out, err = nnet3_lib.RunKaldiCommand("nnet3-info {0}/ref.raw | head -4".format(config_dir)) + #out looks like this + # left-context: 7 + # right-context: 0 + # num-parameters: 90543902 + # modulus: 1 + info = {} + for line in out.split("\n"): + parts = line.split(":") + if len(parts) != 2: + continue + info[parts[0].strip()] = int(parts[1].strip()) + + # Writing the back-compatible vars file + # model_left_context=0 + # model_right_context=7 + # num_hidden_layers=3 + vf = open('{0}/vars'.format(config_dir), 'w') + vf.write('model_left_context={0}\n'.format(info['left-context'])) + vf.write('model_right_context={0}\n'.format(info['right-context'])) + vf.write('num_hidden_layers=1\n') + vf.close() + + nnet3_lib.ForceSymlink("final.config".format(config_dir), + "{0}/layer1.config".format(config_dir)) + +def main(): + args = get_args() + backup_xconfig_file(args.xconfig_file, args.config_dir) + all_layers = 
xparser.read_xconfig_file(args.xconfig_file) + write_expanded_xconfig_files(args.config_dir, all_layers) + write_config_files(args.config_dir, all_layers) + add_back_compatibility_info(args.config_dir) + + +if __name__ == '__main__': + main() + + +# test: +# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo +# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo + +# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'relu-renorm-layer name=affine1 dim=1024'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo + +# mkdir -p foo; (echo 'input dim=100 name=ivector'; echo 'input dim=40 name=input'; echo 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo diff --git a/egs/wsj/s5/utils/lang/make_unk_lm.sh b/egs/wsj/s5/utils/lang/make_unk_lm.sh index f92d02ffc43..b46ab128b93 100755 --- a/egs/wsj/s5/utils/lang/make_unk_lm.sh +++ b/egs/wsj/s5/utils/lang/make_unk_lm.sh @@ -258,7 +258,7 @@ if ! $position_dependent_phones; then # We don't need to take into account the disambig symbol because we compose on # the right with this FST, and it doesn't appear on the output side. cat $dir/all_nonsil_phones | \ - awk -v '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; } + awk '{ph[$1]=1} END{ for (p in ph) { print 0,1,p,p; print 1,2,p,p; print 2,2,p,p; } print 2,0.0; }' > $dir/constraint_fst.txt fi else diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index ea5264a0f07..054210cdd23 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -51,7 +51,6 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 -num_word_disambig_syms=1 position_dependent_phones=true # position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source diff --git a/src/INSTALL b/src/INSTALL index e0fdcc81e60..3f7a01928ba 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -7,14 +7,13 @@ You must first have completed the installation steps in ../tools/INSTALL (compiling OpenFst; getting ATLAS and CLAPACK headers). The installation instructions are: -./configure +./configure --shared make depend make Note that "make" takes a long time; you can speed it up by running make -in parallel if you have multiple CPUs, for instance +in parallel if you have multiple CPUs, for instance make depend -j 8 make -j 8 For more information, see documentation at http://kaldi-asr.org/doc/ and click on "The build process (how Kaldi is compiled)". 
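add_back_compatibility_info() recovers the model context by scraping the first few "key: value" lines that nnet3-info prints for ref.raw, then writes the old-style vars file. A minimal sketch of just the parsing and formatting step (the sample output is the one quoted in the script's comment; the real code obtains it via RunKaldiCommand):

# Parse "key: value" lines from nnet3-info and format the back-compat vars.
sample_output = """left-context: 7
right-context: 0
num-parameters: 90543902
modulus: 1"""

info = {}
for line in sample_output.split("\n"):
    parts = line.split(":")
    if len(parts) != 2:
        continue
    info[parts[0].strip()] = int(parts[1].strip())

vars_text = ("model_left_context={0}\n"
             "model_right_context={1}\n"
             "num_hidden_layers=1\n").format(info["left-context"],
                                             info["right-context"])
print(vars_text)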
- diff --git a/src/base/kaldi-math-test.cc b/src/base/kaldi-math-test.cc index 52719cc4669..8d6e6164eac 100644 --- a/src/base/kaldi-math-test.cc +++ b/src/base/kaldi-math-test.cc @@ -57,6 +57,17 @@ void UnitTestRoundUpToNearestPowerOfTwo() { KALDI_ASSERT(RoundUpToNearestPowerOfTwo(1073700000) == 1073741824); } +void UnitTestDivideRoundingDown() { + for (int32 i = 0; i < 100; i++) { + int32 a = RandInt(-100, 100); + int32 b = 0; + while (b == 0) + b = RandInt(-100, 100); + KALDI_ASSERT(DivideRoundingDown(a, b) == + std::floor(static_cast(a) / static_cast(b))); + } +} + void UnitTestGcdLcm() { UnitTestGcdLcmTpl(); UnitTestGcdLcmTpl(); @@ -314,6 +325,7 @@ int main() { UnitTestRand(); UnitTestAssertFunc(); UnitTestRoundUpToNearestPowerOfTwo(); + UnitTestDivideRoundingDown(); UnitTestExpSpeed(); UnitTestExpSpeed(); UnitTestLogSpeed(); diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index ac590a06a25..3ee6fe4ccf9 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -280,6 +280,17 @@ static inline void AssertEqual(float a, float b, // RoundUpToNearestPowerOfTwo does the obvious thing. It crashes if n <= 0. int32 RoundUpToNearestPowerOfTwo(int32 n); +/// Returns a / b, rounding towards negative infinity in all cases. +static inline int32 DivideRoundingDown(int32 a, int32 b) { + KALDI_ASSERT(b != 0); + if (a * b >= 0) + return a / b; + else if (a < 0) + return (a - b + 1) / b; + else + return (a - b - 1) / b; +} + template I Gcd(I m, I n) { if (m == 0 || n == 0) { if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. diff --git a/src/bin/acc-tree-stats.cc b/src/bin/acc-tree-stats.cc index 90432c2e58a..8b9ce9065b4 100644 --- a/src/bin/acc-tree-stats.cc +++ b/src/bin/acc-tree-stats.cc @@ -128,5 +128,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/align-text.cc b/src/bin/align-text.cc index 833e29efe3b..616dac858d7 100644 --- a/src/bin/align-text.cc +++ b/src/bin/align-text.cc @@ -58,9 +58,11 @@ int main(int argc, char *argv[]) { po.Register("special-symbol", &special_symbol, "Special symbol to be " "aligned with the inserted or deleted words. Your sentences " "should not contain this symbol."); - po.Register("separator", &separator, "Separator for each aligned pairs in " - "the output alignment file. Your sentences should not contain " - "this symbol."); + po.Register("separator", &separator, "Separator for each aligned pair in " + "the output alignment file. Note: it should not be necessary " + "to change this even if your sentences contain ';', because " + "to parse the output of this program you can just split on " + "space and then assert that every third token is ';'."); po.Read(argc, argv); @@ -91,16 +93,12 @@ int main(int argc, char *argv[]) { const std::vector &text1 = text1_reader.Value(); const std::vector &text2 = text2_reader.Value(key); - // Checks if the special symbol and separator is in the string. + // Checks if the special symbol is in the string. KALDI_ASSERT(std::find(text1.begin(), text1.end(), special_symbol) == text1.end()); KALDI_ASSERT(std::find(text2.begin(), text2.end(), special_symbol) == text2.end()); - KALDI_ASSERT(std::find(text1.begin(), - text1.end(), separator) == text1.end()); - KALDI_ASSERT(std::find(text2.begin(), - text2.end(), separator) == text2.end()); - + if (std::find_if(text1.begin(), text1.end(), IsNotToken) != text1.end()) { KALDI_ERR << "In text1, the utterance " << key << " contains unprintable characters." 
\ << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; @@ -111,7 +109,7 @@ int main(int argc, char *argv[]) { << "That means there is a problem with the text (such as incorrect encoding)." << std::endl; return -1; } - + std::vector > aligned; LevenshteinAlignment(text1, text2, special_symbol, &aligned); diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index b092b3de4d7..6f494a0c562 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -220,11 +220,6 @@ static void SortOnTransitionCount(fst::StdVectorFst *fst) { void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { for (int32 i = 1; i <= 3; i++) { - fst::PushSpecial(fst, fst::kDelta * 0.01); - MinimizeAcceptorNoPush(fst); - KALDI_LOG << "Number of states and arcs in transition-id FST after regular " - << "minimization is " << fst->NumStates() << " and " - << NumArcs(*fst) << " (pass " << i << ")"; fst::StdVectorFst fst_reversed; fst::Reverse(*fst, &fst_reversed); fst::PushSpecial(&fst_reversed, fst::kDelta * 0.01); @@ -233,6 +228,11 @@ void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { KALDI_LOG << "Number of states and arcs in transition-id FST after reversed " << "minimization is " << fst->NumStates() << " and " << NumArcs(*fst) << " (pass " << i << ")"; + fst::PushSpecial(fst, fst::kDelta * 0.01); + MinimizeAcceptorNoPush(fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after regular " + << "minimization is " << fst->NumStates() << " and " + << NumArcs(*fst) << " (pass " << i << ")"; } fst::RmEpsilon(fst); KALDI_LOG << "Number of states and arcs in transition-id FST after " @@ -347,7 +347,7 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, BaseFloat self_loop_scale = 1.0; // We have to be careful to use the same // value in test time. - bool reorder = false; + bool reorder = true; // add self-loops to the FST with transition-ids as its labels. AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, &transition_id_fst); diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 640040c60f3..f093f21a5a5 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -20,21 +20,40 @@ #include #include "chain/chain-kernels-ansi.h" -template -__device__ inline void atomic_add(Real* address, Real value) { - atomicAdd(address, value); -} +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200 +#error - Kaldi no longer supports CC1.x devices. Please use a newer GPU or \ + configure with --use-cuda=no (this will disable the use of GPU). 
+#endif -template<> -__device__ inline void atomic_add(double* address, double val) { - unsigned long long int* address_as_ull = - reinterpret_cast(address); + +#ifdef __CUDACC__ +#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 ) +// native implementation available +#else +#if __CUDA_ARCH__ >= 600 +#error using CAS implementation of double atomicAdd +#endif +__device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*) address; unsigned long long int old = *address_as_ull, assumed; + do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); + + return __longlong_as_double(old); +} +#endif +#endif + + +template +__device__ inline void atomic_add(Real* address, Real value) { + atomicAdd(address, value); } template @@ -268,4 +287,3 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, this_beta, log_prob_deriv, log_prob_deriv_stride); } - diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 00ed56308b3..b0c963595a1 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -316,15 +316,15 @@ int main(int argc, char *argv[]) { num_written++; } } else if (count > 0) { - const NnetChainExample &eg = example_reader.Value(); + NnetChainExample eg = example_reader.Value(); + if (frame_shift != 0) + ShiftChainExampleTimes(frame_shift, exclude_names, &eg); NnetChainExample eg_out; if (left_context != -1 || right_context != -1) ModifyChainExampleContext(eg, left_context, right_context, frame_subsampling_factor, &eg_out); else - eg_out = eg; - if (frame_shift != 0) - ShiftChainExampleTimes(frame_shift, exclude_names, &eg_out); + eg_out.Swap(&eg); if (truncate_deriv_weights != 0) TruncateDerivWeights(truncate_deriv_weights, &eg_out); for (int32 c = 0; c < count; c++) { @@ -344,5 +344,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/configure b/src/configure index 1b94d744228..3446a9532e0 100755 --- a/src/configure +++ b/src/configure @@ -9,8 +9,8 @@ # Example command lines: -# ./configure # ./configure --shared ## shared libraries. 
+# ./configure # ./configure --mkl-root=/opt/intel/mkl # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb @@ -447,8 +447,8 @@ function configure_cuda { fi case $CUDA_VERSION in - 5_5) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; - 6_*) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; + 5_5) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; 7_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; 8_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_62,code=sm_62" ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 4642048989e..d475143d444 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -220,6 +220,9 @@ void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); +void cudaF_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride, + const float *x, MatrixDim x_d, float tartget_rms, + bool add_log_stddev); void cudaF_group_spec_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); @@ -489,6 +492,9 @@ void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); +void cudaD_normalize_per_row(size_t Gr, size_t Bl, double *y, int y_stride, + const double *x, MatrixDim x_d, double tartget_rms, + bool add_log_stddev); void cudaD_group_spec_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index bddd1227441..00f4f14cb66 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,6 +28,8 @@ #include #include "cudamatrix/cu-kernels-ansi.h" + + /*********************************************************************** * Generic __device__ functions */ @@ -379,7 +381,7 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) { int32_cuda dst_index = i + j * dst_d.stride, src_index = i + j * src_stride; if (i < dst_d.cols && j < dst_d.rows) { Real a = mat[dst_index], b = A[src_index]; - mat[dst_index] = (a > b ? 
a : b); + mat[dst_index] = fmax(a, b); } } @@ -890,9 +892,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, // Tree reduce to 2x warpSize elements. # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { + if (tid < shift) ssum[tid] += ssum[tid + shift]; - } __syncthreads(); } @@ -1248,7 +1249,7 @@ struct TransReduceOp { } __forceinline__ __device__ Real Reduce(const Real& a, const Real& b) const { - return max(a, b); + return fmax(a, b); } __forceinline__ __device__ Real PostReduce(const Real& x, const Real& output) const { @@ -1288,7 +1289,7 @@ struct TransReduceOp { } __forceinline__ __device__ Real Reduce(const Real& a, const Real& b) const { - return max(a, b); + return fmax(a, b); } __forceinline__ __device__ Real PostReduce(const Real& x, const Real& output) const { @@ -2155,7 +2156,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { // reduce to CU1DBLOCK elements per row. Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF; for (int j = tid; j < d.cols; j += CU1DBLOCK) { - tmax = max(tmax, x[x_start + j]); + tmax = fmax(tmax, x[x_start + j]); } smem[tid] = tmax; __syncthreads(); @@ -2164,7 +2165,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { if (tid < shift) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } __syncthreads(); } @@ -2173,7 +2174,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } } @@ -2217,6 +2218,77 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { } } +// The output y_i = scale * x_i, +// and we want to RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop, that the floor's +// square root is exactly representable as float. +// If add_log_stddev is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. +// +// 1D grid is used. Each 256-thread block works on 1 row of the data matrix. +// The block is also of 1D. Strided memory access is used if the length of the +// row is longer than 256. +template +__global__ +static void _normalize_per_row(Real *y, int y_stride, const Real *x, + MatrixDim x_d, Real target_rms, + bool add_log_stddev) { + const int i = blockIdx.x; + const int tid = threadIdx.x; + const Real* x_row = x + i * x_d.stride; + __shared__ Real ssum[CU1DBLOCK]; + + // Reduce x_j^2 to CU1DBLOCK elements per row + Real tsum = Real(0); + for (int j = tid; j < x_d.cols; j += CU1DBLOCK) { + tsum += x_row[j] * x_row[j]; + } + ssum[tid] = tsum; + __syncthreads(); + + // Tree reduce to 2x warpSize elements per row +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + ssum[tid] += ssum[tid + shift]; + __syncthreads(); + } + + // Reduce last warp to 1 element per row. + // Threads implicitly synchronized within a warp. 
+ if (tid < warpSize) { +# pragma unroll + for (int shift = warpSize; shift > 0; shift >>= 1) { + ssum[tid] += ssum[tid + shift]; + } + } + + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + if (tid == 0) { + ssum[0] = sqrt( + fmax(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); + } + + // Broadcast floored stddev to all threads. + __syncthreads(); + const Real stddev_div_target_rms = ssum[0]; + const Real scale = Real(1) / stddev_div_target_rms; + + // Store normalized input to output + Real* y_row = y + i * y_stride; + for (int j = tid; j < x_d.cols; j += CU1DBLOCK) { + y_row[j] = x_row[j] * scale; + } + + if (tid == 0 && add_log_stddev) { + y_row[x_d.cols] = log(stddev_div_target_rms * target_rms); + } +} + + // Per-row log-softmax operation on 'x', with writing to 'y'. // note, x and y may point to the same memory. This is equivalent to setting // matrix y to matrix x and then, for each row of y, subtracting the offset that @@ -2240,7 +2312,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, // reduce to CU1DBLOCK elements per row. Real tmax = -1e20; for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { - tmax = max(tmax, x[x_start + j]); + tmax = fmax(tmax, x[x_start + j]); } smem[tid] = tmax; __syncthreads(); @@ -2249,7 +2321,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { if (tid < shift) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } __syncthreads(); } @@ -2257,7 +2329,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, // reduce to 1 element per row if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = fmax(smem[tid], smem[tid + shift]); } } @@ -3182,6 +3254,12 @@ void cudaF_splice(dim3 Gr, dim3 Bl, float* y, const float* x, _splice<<>>(y,x,off,d_out,d_in); } +void cudaF_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride, + const float *x, MatrixDim x_d, float target_rms, + bool add_log_stddev) { + _normalize_per_row<<>>(y, y_stride, x, x_d, target_rms, add_log_stddev); +} + void cudaF_one(int Gr, int Bl, float* x, int dim) { _one<<>>(x,dim); } @@ -3811,6 +3889,12 @@ void cudaD_log_softmax_reduce(size_t Gr, size_t Bl, double* y, const double* x, _log_softmax_reduce<<>>(y, x, y_dim, x_stride); } +void cudaD_normalize_per_row(size_t Gr, size_t Bl, double *y, int y_stride, + const double *x, MatrixDim x_d, double target_rms, + bool add_log_stddev) { + _normalize_per_row<<>>(y, y_stride, x, x_d, target_rms, add_log_stddev); +} + void cudaD_splice(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* off, MatrixDim d_out, MatrixDim d_in) { _splice<<>>(y,x,off,d_out,d_in); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index a6e81db5d6c..55259cba147 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -590,6 +590,12 @@ inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, MatrixDim d) { cudaF_diff_xent(Gr, Bl, vec_tgt, mat_net_out, vec_log_post, d); } +inline void cuda_normalize_per_row(size_t Gr, size_t Bl, float *y, int y_stride, + const float *x, MatrixDim x_d, + float target_rms, bool add_log_stddev) { + cudaF_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms, + add_log_stddev); +} inline void cuda_diff_softmax(dim3 Gr, dim3 Bl, float* x, const MatrixDim 
dim, const float* value, const int value_stride, const float* diff, const int diff_stride) { @@ -1110,6 +1116,12 @@ inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y, int x_stride) { cudaD_log_softmax_reduce(Gr, Bl, y, x, y_dim, x_stride); } +inline void cuda_normalize_per_row(size_t Gr, size_t Bl, double *y, + int y_stride, const double *x, MatrixDim x_d, + double target_rms, bool add_log_stddev) { + cudaD_normalize_per_row(Gr, Bl, y, y_stride, x, x_d, target_rms, + add_log_stddev); +} inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d, diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 2e096e76ae8..494c676250b 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -139,6 +139,76 @@ static void UnitTestCuMathSplice() { } } + +template +static void UnitTestCuMathNormalizePerRow() { + + for (int32 i = 0; i < 2; i++) { + int row = 10 + Rand() % 40; + int col = 10 + Rand() % 50; + + Matrix Hi(row,col); + Matrix Ho(row,col+1); + Hi.SetRandn(); + Hi.Scale(5.0); + + CuMatrix Di(row, col); + CuMatrix Do(row, col+1); + Di.CopyFromMat(Hi); + + Real target_rms = 0.3456; + bool add_log_stddev = true; + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + + //gpu + cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do); + + //cpu + { + MatrixBase& in(Hi); + MatrixBase& out(Ho); + Real target_rms=0.3456; + SubMatrix out_no_log(out, 0, out.NumRows(), 0, in.NumCols()); + if (in.Data() != out_no_log.Data()) + out_no_log.CopyFromMat(in); + Vector in_norm(in.NumRows()); + Real d_scaled = in.NumCols() * target_rms * target_rms; + in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); + in_norm.ApplyFloor(kSquaredNormFloor); + in_norm.ApplyPow(-0.5); + out_no_log.MulRowsVec(in_norm); + if (add_log_stddev) { + in_norm.ApplyLog(); + in_norm.Scale(-1.0); + in_norm.Add(log(target_rms)); + out.CopyColFromVec(in_norm, in.NumCols()); + } + } + + Matrix Ho2(Do); + AssertEqual(Ho,Ho2,0.00001); + } + + for (int dim = 16; dim <= 1024; dim *= 2) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim + 1); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + cu::NormalizePerRow(M, Real(1), true, &N); + } + + BaseFloat gflops = ((BaseFloat) dim * dim * iter) + / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::NormalizePerRow" + << (sizeof(Real)==8?"":"") << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + } +} + + template void CudaMathUnitTest() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) @@ -146,6 +216,7 @@ template void CudaMathUnitTest() { UnitTestCuMathRandomize(); UnitTestCuMathSplice(); UnitTestCuMathCopy(); + UnitTestCuMathNormalizePerRow(); } diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index 97757ba68dd..f01760d41bb 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -29,15 +29,15 @@ namespace kaldi { namespace cu { /* - * templated functions wrapping the ANSI-C CUDA kernel functions + * templated functions wrapping the ANSI-C CUDA kernel functions */ template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, Real lr) { KALDI_ASSERT(SameDim(*weight, *grad)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); @@ -46,7 +46,7 @@ void 
RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr, weight->Dim(), grad->Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, MatrixBase &grad2 = grad->Mat(); for(MatrixIndexT r=0; r &src, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + /* - Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 + Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK)); */ /* * Let's use blocksize 4 x 128 (512 threads/block) - * and extend the randomizable matrices to: col 4*65535, row 128*65535 + * and extend the randomizable matrices to: col 4*65535, row 128*65535 * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints)) */ dim3 dimBlock(4, 128); @@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase &src, cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_idx.Data(), dimtgt, dimsrc); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase &src, tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i])); } } -} +} template void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, CuMatrixBase *tgt) { - + KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(), frame_offsets.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -171,7 +171,7 @@ void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, template void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, - CuMatrixBase *tgt) { + CuMatrixBase *tgt) { KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); @@ -179,14 +179,14 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_indices.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -233,9 +233,65 @@ void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, CuMatrixBase *tgt); +// The output y_i = scale * x_i, +// and we want to RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop, that the floor's +// square root is exactly representable as float. 
+// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. +template +void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, + const bool add_log_stddev, CuMatrixBase* out) { + const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + if (add_log_stddev) { + KALDI_ASSERT(in.NumRows() == out->NumRows()); + KALDI_ASSERT(in.NumCols() + 1 == out->NumCols()); + } else { + KALDI_ASSERT(SameDim(in, *out)); + } + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + size_t dimBlock = CU1DBLOCK; + size_t dimGrid = out->NumRows(); + cuda_normalize_per_row(dimGrid, dimBlock, out->Data(), out->Stride(), + in.Data(), in.Dim(), target_rms, add_log_stddev); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + CuSubMatrix out_no_log(*out, 0, out->NumRows(), 0, in.NumCols()); + if (in.Data() != out_no_log.Data()) + out_no_log.CopyFromMat(in); + CuVector in_norm(in.NumRows()); + Real d_scaled = in.NumCols() * target_rms * target_rms; + in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); + in_norm.ApplyFloor(kSquaredNormFloor); + in_norm.ApplyPow(-0.5); + out_no_log.MulRowsVec(in_norm); + if (add_log_stddev) { + in_norm.ApplyLog(); + in_norm.Scale(-1.0); + in_norm.Add(log(target_rms)); + out->CopyColFromVec(in_norm, in.NumCols()); + } + } +} + +template +void NormalizePerRow(const CuMatrixBase& in, const float target_rms, + const bool add_log_stddev, CuMatrixBase* out); +template +void NormalizePerRow(const CuMatrixBase& in, const double target_rms, + const bool add_log_stddev, CuMatrixBase* out); + } //namespace cu } //namespace kaldi - diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 65a4c0c4af3..0afbb9476a1 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -78,6 +78,23 @@ void Group2norm(const CuMatrixBase &src, CuMatrixBase *dest, int32 group_stride); +/// Normalize nonlinearity modifies the vector of activations +/// by scaling it so that the root-mean-square equals 1.0. +/// +/// The output y_i = scale * x_i, +/// and we want to RMS value of the y_i to equal target_rms, +/// so y^t y = D * target_rms^2 (if y is one row of the input). +/// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +/// there is also flooring involved, to avoid division-by-zero +/// problems. It's important for the backprop, that the floor's +/// square root is exactly representable as float. +/// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +/// is an extra dimension of the output. +template +void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, + const bool add_log_stddev, CuMatrixBase* out); + + diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 30873cfa9b0..5788b95d9c0 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -92,7 +92,17 @@ great loss. The pdf-class is a concept that relates to the HmmTopology object. The HmmTopology object specifies a prototype HMM for each phone. Each numbered state of a -"prototype HMM" has a variable "pdf_class". If two states have the same +"prototype HMM" has two variables "forward_pdf_class" and "self_loop_pdf_class". +The "self_loop_pdf_class" is a kind of pdf-class that is associated +with self-loop transition. It is by default identical to "forward_pdf_class", +but it can be used to define less-convectional HMM topologies +where the pdfs on the self-loop and forward transitions are different. 
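The CUDA kernel and the CPU fallback introduced above compute the same per-row normalization: each row x of dimension D is scaled by 1 / sqrt(x^T x / (D * target_rms^2)), with the squared-norm term floored at 2^-66 so the scale stays finite, and with an optional extra output column holding log(sqrt(floored x^T x / D)). A NumPy sketch of the same arithmetic, intended only as a reference against the C++ paths (NumPy is an assumption made for illustration):

import numpy as np

def normalize_per_row(x, target_rms=1.0, add_log_stddev=False):
    # x has shape (num_rows, D); mirrors cu::NormalizePerRow in cu-math.cc.
    k_squared_norm_floor = 1.35525271560688e-20  # 2^-66
    d_scaled = x.shape[1] * target_rms * target_rms
    sq_norm = np.maximum((x * x).sum(axis=1) / d_scaled, k_squared_norm_floor)
    scale = 1.0 / np.sqrt(sq_norm)               # per-row scaling factor
    out = x * scale[:, None]
    if add_log_stddev:
        # extra column: log of the (floored) uncentred stddev of the row
        out = np.hstack([out, np.log(np.sqrt(sq_norm) * target_rms)[:, None]])
    return out

x = np.random.randn(3, 5)
y = normalize_per_row(x, target_rms=0.3456, add_log_stddev=True)
print(np.sqrt((y[:, :5] ** 2).mean(axis=1)))     # each row's RMS ~ 0.3456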
+The decision to allow the pdf-class on just the self-loop to be different, +while not embracing a fully "arc-based" representation where the pdfs on +all transitions in the HMM are potentially independent, was made as a compromise, +to allow for compatibility with previous versions of Kaldi while supporting the topology +used in our "chain models" AKA lattice-free MMI. +If two states have the same pdf_class variable, then they will always share the same probability distribution function (p.d.f.) if they are in the same phonetic context. This is because the decision-tree code does not get to "see" the HMM-state directly, @@ -121,11 +131,14 @@ object to get the pdf-ids associated with particular phonetic contexts). The decision that underlies a lot of the transition-modeling code is as follows: we have decided to make the transition probability of a -context dependent HMM state depend on the following four things (you could view -them as a 4-tuple): +context dependent HMM state depend on the following five things (you could view +them as a 5-tuple): - The phone (whose HMM we are in) - The source HMM-state (as interpreted by the HmmTopology object, i.e. normally 0, 1 or 2) - - The \ref pdf_id "pdf-id" (i.e. the index of the pdf associated with the state) + - The \ref pdf_id "forward-pdf-id" + (i.e. the index of the forward transition pdfs associated with the state) + - The \ref pdf_id "self-loop-pdf-id" + (i.e. the index of the self-loop pdfs associated with the state) - The index of the transition in the HmmTopology object. The last of these four items could be viewed as encoding the destination @@ -198,7 +211,7 @@ prototype HMM (as given in the HmmTopology object). from (transition-state, transition-index) to transition-id, and vice versa. There are also in the transition-modeling code reference to the following concepts: - - A triple means a triple (phone, hmm-state, pdf) which is mappable to and from a transition-state. + - A tuple means a 4-tuple (phone, hmm-state, forward pdf, self-loop pdf) which is mappable to and from a transition-state. - A pair means a pair (transition-state, transition-index) which is mappable to and from a transition-id. \section hmm_transition_training Training the transition model diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc index 0034027cbe6..719e55dd6da 100644 --- a/src/feat/feature-plp.cc +++ b/src/feat/feature-plp.cc @@ -125,7 +125,7 @@ void PlpComputer::Compute(BaseFloat signal_log_energy, if (opts_.use_energy && !opts_.raw_energy) signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), - std::numeric_limits::min())); + std::numeric_limits::min())); if (srfft_ != NULL) // Compute FFT using split-radix algorithm. 
srfft_->Compute(signal_frame->Data(), true); diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc index 2726462d22c..65c0a2a29c3 100644 --- a/src/feat/feature-window.cc +++ b/src/feat/feature-window.cc @@ -140,7 +140,7 @@ void ProcessWindow(const FrameExtractionOptions &opts, if (log_energy_pre_window != NULL) { BaseFloat energy = std::max(VecVec(*window, *window), - std::numeric_limits::epsilon()); + std::numeric_limits::epsilon()); *log_energy_pre_window = Log(energy); } diff --git a/src/fstbin/fstdeterminizestar.cc b/src/fstbin/fstdeterminizestar.cc index ccd70764189..5e3de3e7ef9 100644 --- a/src/fstbin/fstdeterminizestar.cc +++ b/src/fstbin/fstdeterminizestar.cc @@ -24,10 +24,12 @@ #include "fstext/determinize-star.h" #include "fstext/fstext-utils.h" #include "fstext/kaldi-fst-io.h" -#ifndef _MSC_VER +#if !defined(_MSC_VER) && !defined(__APPLE__) #include // Comment this line and the call to signal below if // it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. +// when determinization does not terminate. We are disabling this code if +// compiling on Windows because signal.h is not available there, and on +// MacOS due to a problem with in the initial release of Sierra. #endif /* some test examples: @@ -91,7 +93,7 @@ int main(int argc, char *argv[]) { // This enables us to get traceback info from determinization that is // not seeming to terminate. -#ifndef _MSC_VER +#if !defined(_MSC_VER) && !defined(__APPLE__) signal(SIGUSR1, signal_handler); #endif if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { @@ -138,4 +140,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/fstbin/fstpropfinal.cc b/src/fstbin/fstpropfinal.cc index f469d7a93d1..d9a221f7805 100644 --- a/src/fstbin/fstpropfinal.cc +++ b/src/fstbin/fstpropfinal.cc @@ -1,6 +1,7 @@ // fstbin/fstpropfinal.cc // Copyright 2009-2011 Microsoft Corporation +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -25,13 +26,7 @@ #include "fstext/fstext-utils.h" #include "fstext/kaldi-fst-io.h" -#ifndef _MSC_VER -#include // Comment this line and the call to signal below if -// it causes compilation problems. It is only to enable a debugging procedure -// when determinization does not terminate. -#endif - -/* some test examples. +/* A test example. You have to have the right things on your PATH for this to work. cat < 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst > 2.fst - fstequivalent --random=true 1.fst 2.fst || echo "Test failed" - echo -n "." - done - - Test of debugging [with non-determinizable input]: - ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo "2"; echo "1" ) | fstcompile | fstdeterminizestar - kill -SIGUSR1 [the process-id of fstdeterminizestar] - # prints out a bunch of debugging output showing the mess it got itself into. 
*/ -bool debug_location = false; -void signal_handler(int) { - debug_location = true; -} - - - int main(int argc, char *argv[]) { try { using namespace kaldi; @@ -93,7 +62,7 @@ int main(int argc, char *argv[]) { std::string phi_str = po.GetOptArg(1), fst_in_str = po.GetOptArg(2), fst_out_str = po.GetOptArg(3); - + int32 phi_label; if (!ConvertStringToInteger(phi_str, &phi_label) @@ -104,9 +73,9 @@ int main(int argc, char *argv[]) { VectorFst *fst = ReadFstKaldi(fst_in_str); - + PropagateFinal(phi_label, fst); - + WriteFstKaldi(*fst, fst_out_str); delete fst; return 0; @@ -115,4 +84,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/hmm/hmm-test-utils.cc b/src/hmm/hmm-test-utils.cc index 4cfebcd0d51..ceca116c828 100644 --- a/src/hmm/hmm-test-utils.cc +++ b/src/hmm/hmm-test-utils.cc @@ -203,7 +203,7 @@ void GeneratePathThroughHmm(const HmmTopology &topology, const HmmTopology::HmmState &cur_hmm_state = this_entry[cur_state]; int32 num_transitions = cur_hmm_state.transitions.size(), transition_index = RandInt(0, num_transitions - 1); - if (cur_hmm_state.pdf_class != -1) { + if (cur_hmm_state.forward_pdf_class != -1) { std::pair pr(cur_state, transition_index); if (!reorder) { path->push_back(pr); @@ -257,12 +257,15 @@ void GenerateRandomAlignment(const ContextDependencyInterface &ctx_dep, trans_model.GetTopo().TopologyForPhone(phone); int32 hmm_state = path[k].first, transition_index = path[k].second, - pdf_class = entry[hmm_state].pdf_class, - pdf_id; - bool ans = ctx_dep.Compute(context_window, pdf_class, &pdf_id); + forward_pdf_class = entry[hmm_state].forward_pdf_class, + self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, + forward_pdf_id, self_loop_pdf_id; + bool ans = ctx_dep.Compute(context_window, forward_pdf_class, &forward_pdf_id); KALDI_ASSERT(ans && "context-dependency computation failed."); - int32 transition_state = trans_model.TripleToTransitionState( - phone, hmm_state, pdf_id), + ans = ctx_dep.Compute(context_window, self_loop_pdf_class, &self_loop_pdf_id); + KALDI_ASSERT(ans && "context-dependency computation failed."); + int32 transition_state = trans_model.TupleToTransitionState( + phone, hmm_state, forward_pdf_id, self_loop_pdf_id), transition_id = trans_model.PairToTransitionId(transition_state, transition_index); alignment->push_back(transition_id); diff --git a/src/hmm/hmm-topology-test.cc b/src/hmm/hmm-topology-test.cc index 61cf13e17bc..14081d2355d 100644 --- a/src/hmm/hmm-topology-test.cc +++ b/src/hmm/hmm-topology-test.cc @@ -58,6 +58,17 @@ void TestHmmTopology() { " \n" " \n"; + std::string chain_input_str = "\n" + "\n" + " 1 2 3 4 5 6 7 8 9 \n" + " 0 0 1\n" + " 0 0.5\n" + " 1 0.5\n" + " \n" + " 1 \n" + "\n" + "\n"; + HmmTopology topo; if (RandInt(0, 1) == 0) { @@ -84,6 +95,13 @@ void TestHmmTopology() { KALDI_ASSERT(oss1.str() == oss2.str()); } + { // test chain topology + HmmTopology chain_topo; + std::istringstream chain_iss(chain_input_str); + chain_topo.Read(chain_iss, false); + KALDI_ASSERT(chain_topo.MinLength(3) == 1); + } + { // make sure GetDefaultTopology does not crash. 
std::vector phones; phones.push_back(1); diff --git a/src/hmm/hmm-topology.cc b/src/hmm/hmm-topology.cc index 54144326766..cf134065dbf 100644 --- a/src/hmm/hmm-topology.cc +++ b/src/hmm/hmm-topology.cc @@ -76,12 +76,24 @@ void HmmTopology::Read(std::istream &is, bool binary) { KALDI_ERR << "States are expected to be in order from zero, expected " << this_entry.size() << ", got " << state; ReadToken(is, binary, &token); - int32 pdf_class = kNoPdf; // -1 by default, means no pdf. + int32 forward_pdf_class = kNoPdf; // -1 by default, means no pdf. if (token == "") { - ReadBasicType(is, binary, &pdf_class); + ReadBasicType(is, binary, &forward_pdf_class); + this_entry.push_back(HmmState(forward_pdf_class)); ReadToken(is, binary, &token); - } - this_entry.push_back(HmmState(pdf_class)); + if (token == "") + KALDI_ERR << "pdf classes should be defined using " + << "or / pair"; + } else if (token == "") { + int32 self_loop_pdf_class = kNoPdf; + ReadBasicType(is, binary, &forward_pdf_class); + ReadToken(is, binary, &token); + KALDI_ASSERT(token == ""); + ReadBasicType(is, binary, &self_loop_pdf_class); + this_entry.push_back(HmmState(forward_pdf_class, self_loop_pdf_class)); + ReadToken(is, binary, &token); + } else + this_entry.push_back(HmmState(forward_pdf_class)); while (token == "") { int32 dst_state; BaseFloat trans_prob; @@ -118,13 +130,22 @@ void HmmTopology::Read(std::istream &is, bool binary) { ReadIntegerVector(is, binary, &phone2idx_); int32 sz; ReadBasicType(is, binary, &sz); + bool is_hmm = true; + if (sz == -1) { + is_hmm = false; + ReadBasicType(is, binary, &sz); + } entries_.resize(sz); for (int32 i = 0; i < sz; i++) { int32 thist_sz; ReadBasicType(is, binary, &thist_sz); entries_[i].resize(thist_sz); for (int32 j = 0 ; j < thist_sz; j++) { - ReadBasicType(is, binary, &(entries_[i][j].pdf_class)); + ReadBasicType(is, binary, &(entries_[i][j].forward_pdf_class)); + if (is_hmm) + entries_[i][j].self_loop_pdf_class = entries_[i][j].forward_pdf_class; + else + ReadBasicType(is, binary, &(entries_[i][j].self_loop_pdf_class)); int32 thiss_sz; ReadBasicType(is, binary, &thiss_sz); entries_[i][j].transitions.resize(thiss_sz); @@ -141,6 +162,7 @@ void HmmTopology::Read(std::istream &is, bool binary) { void HmmTopology::Write(std::ostream &os, bool binary) const { + bool is_hmm = IsHmm(); WriteToken(os, binary, ""); if (!binary) { // Text-mode write. 
os << "\n"; @@ -159,9 +181,17 @@ void HmmTopology::Write(std::ostream &os, bool binary) const { for (size_t j = 0; j < entries_[i].size(); j++) { WriteToken(os, binary, ""); WriteBasicType(os, binary, static_cast(j)); - if (entries_[i][j].pdf_class != kNoPdf) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, entries_[i][j].pdf_class); + if (entries_[i][j].forward_pdf_class != kNoPdf) { + if (is_hmm) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); + } else { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); + KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class); + } } for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) { WriteToken(os, binary, ""); @@ -177,11 +207,15 @@ void HmmTopology::Write(std::ostream &os, bool binary) const { } else { WriteIntegerVector(os, binary, phones_); WriteIntegerVector(os, binary, phone2idx_); + // -1 is put here as a signal that the object has the new, + // extended format with SelfLoopPdfClass + if (!is_hmm) WriteBasicType(os, binary, static_cast(-1)); WriteBasicType(os, binary, static_cast(entries_.size())); for (size_t i = 0; i < entries_.size(); i++) { WriteBasicType(os, binary, static_cast(entries_[i].size())); for (size_t j = 0; j < entries_[i].size(); j++) { - WriteBasicType(os, binary, entries_[i][j].pdf_class); + WriteBasicType(os, binary, entries_[i][j].forward_pdf_class); + if (!is_hmm) WriteBasicType(os, binary, entries_[i][j].self_loop_pdf_class); WriteBasicType(os, binary, static_cast(entries_[i][j].transitions.size())); for (size_t k = 0; k < entries_[i][j].transitions.size(); k++) { WriteBasicType(os, binary, entries_[i][j].transitions[k].first); @@ -215,7 +249,7 @@ void HmmTopology::Check() { if (!entries_[i][num_states-1].transitions.empty()) KALDI_ERR << "HmmTopology::Check(), last state must have no transitions."; // not sure how necessary this next stipulation is. - if (entries_[i][num_states-1].pdf_class != kNoPdf) + if (entries_[i][num_states-1].forward_pdf_class != kNoPdf) KALDI_ERR << "HmmTopology::Check(), last state must not be emitting."; std::vector has_trans_in(num_states, false); @@ -223,8 +257,10 @@ void HmmTopology::Check() { for (int32 j = 0; j < num_states; j++) { // j is the state-id. BaseFloat tot_prob = 0.0; - if (entries_[i][j].pdf_class != kNoPdf) - seen_pdf_classes.push_back(entries_[i][j].pdf_class); + if (entries_[i][j].forward_pdf_class != kNoPdf) { + seen_pdf_classes.push_back(entries_[i][j].forward_pdf_class); + seen_pdf_classes.push_back(entries_[i][j].self_loop_pdf_class); + } std::set seen_transition; for (int32 k = 0; static_cast(k) < entries_[i][j].transitions.size(); @@ -238,7 +274,7 @@ void HmmTopology::Check() { // that are being built, which enable the creation of phone-level lattices // and rescoring these with a different lexicon and LM. if (dst_state == num_states-1 // && j != 0 - && entries_[i][j].pdf_class == kNoPdf) + && entries_[i][j].forward_pdf_class == kNoPdf) KALDI_ERR << "We do not allow any state to be " "nonemitting and have a transition to the final-state (this would " "stop the SplitToPhones function from identifying the last state " @@ -248,7 +284,8 @@ void HmmTopology::Check() { if (seen_transition.count(dst_state) != 0) KALDI_ERR << "HmmTopology::Check(), duplicate transition found."; if (dst_state == k) { // self_loop... 
- KALDI_ASSERT(entries_[i][j].pdf_class != kNoPdf && "Nonemitting states cannot have self-loops."); + KALDI_ASSERT(entries_[i][j].self_loop_pdf_class != kNoPdf && + "Nonemitting states cannot have self-loops."); } seen_transition.insert(dst_state); has_trans_in[dst_state] = true; @@ -275,6 +312,22 @@ void HmmTopology::Check() { } } +bool HmmTopology::IsHmm() const { + const std::vector &phones = GetPhones(); + KALDI_ASSERT(!phones.empty()); + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + const TopologyEntry &entry = TopologyForPhone(phone); + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... + int32 forward_pdf_class = entry[j].forward_pdf_class, + self_loop_pdf_class = entry[j].self_loop_pdf_class; + if (forward_pdf_class != self_loop_pdf_class) + return false; + } + } + return true; +} + const HmmTopology::TopologyEntry& HmmTopology::TopologyForPhone(int32 phone) const { // Will throw if phone not covered. if (static_cast(phone) >= phone2idx_.size() || phone2idx_[phone] == -1) { KALDI_ERR << "TopologyForPhone(), phone "<<(phone)<<" not covered."; @@ -286,8 +339,10 @@ int32 HmmTopology::NumPdfClasses(int32 phone) const { // will throw if phone not covered. const TopologyEntry &entry = TopologyForPhone(phone); int32 max_pdf_class = 0; - for (size_t i = 0; i < entry.size(); i++) - max_pdf_class = std::max(max_pdf_class, entry[i].pdf_class); + for (size_t i = 0; i < entry.size(); i++) { + max_pdf_class = std::max(max_pdf_class, entry[i].forward_pdf_class); + max_pdf_class = std::max(max_pdf_class, entry[i].self_loop_pdf_class); + } return max_pdf_class+1; } @@ -299,7 +354,7 @@ int32 HmmTopology::MinLength(int32 phone) const { std::numeric_limits::max()); KALDI_ASSERT(!entry.empty()); - min_length[0] = (entry[0].pdf_class == -1 ? 0 : 1); + min_length[0] = (entry[0].forward_pdf_class == -1 ? 0 : 1); int32 num_states = min_length.size(); bool changed = true; while (changed) { @@ -313,7 +368,7 @@ int32 HmmTopology::MinLength(int32 phone) const { int32 next_state = iter->first; KALDI_ASSERT(next_state < num_states); int32 next_state_min_length = min_length[s] + - (entry[next_state].pdf_class == -1 ? 0 : 1); + (entry[next_state].forward_pdf_class == -1 ? 0 : 1); if (next_state_min_length < min_length[next_state]) { min_length[next_state] = next_state_min_length; if (next_state < s) diff --git a/src/hmm/hmm-topology.h b/src/hmm/hmm-topology.h index 79b535e7d6b..edea02998c0 100644 --- a/src/hmm/hmm-topology.h +++ b/src/hmm/hmm-topology.h @@ -95,23 +95,38 @@ class HmmTopology { public: /// A structure defined inside HmmTopology to represent a HMM state. struct HmmState { - /// The \ref pdf_class pdf-class, typically 0, 1 or 2 (the same as the HMM-state index), + /// The \ref pdf_class forward-pdf-class, typically 0, 1 or 2 (the same as the HMM-state index), /// but may be different to enable us to hardwire sharing of state, and may be /// equal to \ref kNoPdf == -1 in order to specify nonemitting states (unusual). - int32 pdf_class; + int32 forward_pdf_class; + + /// The \ref pdf_class self-loop pdf-class, similar to \ref pdf_class forward-pdf-class. + /// They will either both be \ref kNoPdf, or neither be \ref kNoPdf. + int32 self_loop_pdf_class; /// A list of transitions, indexed by what we call a 'transition-index'. /// The first member of each pair is the index of the next HmmState, and the /// second is the default transition probability (before training). 
std::vector > transitions; - explicit HmmState(int32 p): pdf_class(p) { } + explicit HmmState(int32 pdf_class) { + this->forward_pdf_class = pdf_class; + this->self_loop_pdf_class = pdf_class; + } + explicit HmmState(int32 forward_pdf_class, int32 self_loop_pdf_class) { + KALDI_ASSERT((forward_pdf_class != kNoPdf && self_loop_pdf_class != kNoPdf) || + (forward_pdf_class == kNoPdf && self_loop_pdf_class == kNoPdf)); + this->forward_pdf_class = forward_pdf_class; + this->self_loop_pdf_class = self_loop_pdf_class; + } bool operator == (const HmmState &other) const { - return (pdf_class == other.pdf_class && transitions == other.transitions); + return (forward_pdf_class == other.forward_pdf_class && + self_loop_pdf_class == other.self_loop_pdf_class && + transitions == other.transitions); } - HmmState(): pdf_class(-1) { } + HmmState(): forward_pdf_class(-1), self_loop_pdf_class(-1) { } }; /// TopologyEntry is a typedef that represents the topology of @@ -124,6 +139,15 @@ class HmmTopology { // Checks that the object is valid, and throw exception otherwise. void Check(); + /// Returns true if this HmmTopology is really 'hmm-like', i.e. the pdf-class on + /// the self-loops and forward transitions of all states are identical. [note: in HMMs, + /// the densities are associated with the states.] We have extended this to + /// support 'non-hmm-like' topologies (where those pdf-classes are different), + /// in order to make for more compact decoding graphs in our so-called 'chain models' + /// (AKA lattice-free MMI), where we use 1-state topologies that have different pdf-classes + /// for the self-loop and the forward transition. Note that we always use the 'reorder=true' + /// option so the 'forward transition' actually comes before the self-loop. + bool IsHmm() const; /// Returns the topology entry (i.e. vector of HmmState) for this phone; /// will throw exception if phone not covered by the topology. diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index 04ec09d14b7..ab0b133f708 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -93,11 +93,16 @@ fst::VectorFst *GetHmmAsFst( for (int32 hmm_state = 0; hmm_state < static_cast(entry.size()); hmm_state++) { - int32 pdf_class = entry[hmm_state].pdf_class, pdf; - if (pdf_class == kNoPdf) pdf = kNoPdf; // nonemitting state. - else { - KALDI_ASSERT(pdf_class < static_cast(pdfs.size())); - pdf = pdfs[pdf_class]; + int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf; + int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf; + if (forward_pdf_class == kNoPdf) { // nonemitting state. + forward_pdf = kNoPdf; + self_loop_pdf = kNoPdf; + } else { + KALDI_ASSERT(forward_pdf_class < static_cast(pdfs.size())); + KALDI_ASSERT(self_loop_pdf_class < static_cast(pdfs.size())); + forward_pdf = pdfs[forward_pdf_class]; + self_loop_pdf = pdfs[self_loop_pdf_class]; } int32 trans_idx; for (trans_idx = 0; @@ -110,7 +115,7 @@ fst::VectorFst *GetHmmAsFst( if (is_self_loop) continue; // We will add self-loops in at a later stage of processing, // not in this function. - if (pdf_class == kNoPdf) { + if (forward_pdf_class == kNoPdf) { // no pdf, hence non-estimated probability. // [would not happen with normal topology] . There is no transition-state // involved in this case. @@ -118,7 +123,7 @@ fst::VectorFst *GetHmmAsFst( label = 0; } else { // normal probability. 
int32 trans_state = - trans_model.TripleToTransitionState(phone, hmm_state, pdf); + trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf); int32 trans_id = trans_model.PairToTransitionId(trans_state, trans_idx); log_prob = trans_model.GetTransitionLogProbIgnoringSelfLoops(trans_id); @@ -183,10 +188,15 @@ GetHmmAsFstSimple(std::vector phone_window, for (int32 hmm_state = 0; hmm_state < static_cast(entry.size()); hmm_state++) { - int32 pdf_class = entry[hmm_state].pdf_class, pdf; - if (pdf_class == kNoPdf) pdf = kNoPdf; // nonemitting state; not generally used. - else { - bool ans = ctx_dep.Compute(phone_window, pdf_class, &pdf); + int32 forward_pdf_class = entry[hmm_state].forward_pdf_class, forward_pdf; + int32 self_loop_pdf_class = entry[hmm_state].self_loop_pdf_class, self_loop_pdf; + if (forward_pdf_class == kNoPdf) { // nonemitting state; not generally used. + forward_pdf = kNoPdf; + self_loop_pdf = kNoPdf; + } else { + bool ans = ctx_dep.Compute(phone_window, forward_pdf_class, &forward_pdf); + KALDI_ASSERT(ans && "Context-dependency computation failed."); + ans = ctx_dep.Compute(phone_window, self_loop_pdf_class, &self_loop_pdf); KALDI_ASSERT(ans && "Context-dependency computation failed."); } int32 trans_idx; @@ -196,7 +206,7 @@ GetHmmAsFstSimple(std::vector phone_window, BaseFloat log_prob; Label label; int32 dest_state = entry[hmm_state].transitions[trans_idx].first; - if (pdf_class == kNoPdf) { + if (forward_pdf_class == kNoPdf) { // no pdf, hence non-estimated probability. very unusual case. [would // not happen with normal topology] . There is no transition-state // involved in this case. @@ -205,7 +215,7 @@ GetHmmAsFstSimple(std::vector phone_window, label = 0; } else { // normal probability. int32 trans_state = - trans_model.TripleToTransitionState(phone, hmm_state, pdf); + trans_model.TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf); int32 trans_id = trans_model.PairToTransitionId(trans_state, trans_idx); log_prob = prob_scale * trans_model.GetTransitionLogProb(trans_id); @@ -652,8 +662,8 @@ static bool SplitToPhonesInternal(const TransitionModel &trans_model, int32 trans_state = trans_model.TransitionIdToTransitionState(alignment[cur_point]); int32 phone = trans_model.TransitionStateToPhone(trans_state); - int32 pdf_class = trans_model.GetTopo().TopologyForPhone(phone)[0].pdf_class; - if (pdf_class != kNoPdf) // initial-state of the current phone is emitting + int32 forward_pdf_class = trans_model.GetTopo().TopologyForPhone(phone)[0].forward_pdf_class; + if (forward_pdf_class != kNoPdf) // initial-state of the current phone is emitting if (trans_model.TransitionStateToHmmState(trans_state) != 0) was_ok = false; for (size_t j = cur_point; j < end_points[i]; j++) @@ -739,14 +749,19 @@ static inline void ConvertAlignmentForPhone( // the topologies and lengths match -> we can directly transfer // the alignment. 
for (int32 j = 0; j < alignment_size; j++) { - int32 old_tid = old_phone_alignment[j]; - int32 pdf_class = old_trans_model.TransitionIdToPdfClass(old_tid); + int32 old_tid = old_phone_alignment[j], + old_tstate = old_trans_model.TransitionIdToTransitionState(old_tid); + int32 forward_pdf_class = + old_trans_model.TransitionStateToForwardPdfClass(old_tstate), + self_loop_pdf_class = + old_trans_model.TransitionStateToSelfLoopPdfClass(old_tstate); int32 hmm_state = old_trans_model.TransitionIdToHmmState(old_tid); int32 trans_idx = old_trans_model.TransitionIdToTransitionIndex(old_tid); - int32 new_pdf = pdf_ids[pdf_class]; + int32 new_forward_pdf = pdf_ids[forward_pdf_class]; + int32 new_self_loop_pdf = pdf_ids[self_loop_pdf_class]; int32 new_trans_state = - new_trans_model.TripleToTransitionState(new_central_phone, hmm_state, - new_pdf); + new_trans_model.TupleToTransitionState(new_central_phone, hmm_state, + new_forward_pdf, new_self_loop_pdf); int32 new_tid = new_trans_model.PairToTransitionId(new_trans_state, trans_idx); (*new_phone_alignment)[j] = new_tid; diff --git a/src/hmm/transition-model.cc b/src/hmm/transition-model.cc index df22169cd25..83edbaf5805 100644 --- a/src/hmm/transition-model.cc +++ b/src/hmm/transition-model.cc @@ -24,13 +24,26 @@ namespace kaldi { -void TransitionModel::ComputeTriples(const ContextDependencyInterface &ctx_dep) { +void TransitionModel::ComputeTuples(const ContextDependencyInterface &ctx_dep) { + if (IsHmm()) + ComputeTuplesIsHmm(ctx_dep); + else + ComputeTuplesNotHmm(ctx_dep); + + // now tuples_ is populated with all possible tuples of (phone, hmm_state, pdf, self_loop_pdf). + std::sort(tuples_.begin(), tuples_.end()); // sort to enable reverse lookup. + // this sorting defines the transition-ids. +} + +void TransitionModel::ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep) { const std::vector &phones = topo_.GetPhones(); - std::vector > > pdf_info; KALDI_ASSERT(!phones.empty()); + + // this is the case for normal models, but not for chain models + std::vector > > pdf_info; std::vector num_pdf_classes( 1 + *std::max_element(phones.begin(), phones.end()), -1); for (size_t i = 0; i < phones.size(); i++) - num_pdf_classes[phones[i]] = topo_.NumPdfClasses(phones[i]); + num_pdf_classes[phones[i]] = topo_.NumPdfClasses(phones[i]); ctx_dep.GetPdfInfo(phones, num_pdf_classes, &pdf_info); // pdf_info is list indexed by pdf of which (phone, pdf_class) it // can correspond to. @@ -43,47 +56,108 @@ void TransitionModel::ComputeTriples(const ContextDependencyInterface &ctx_dep) int32 phone = phones[i]; const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... - int32 pdf_class = entry[j].pdf_class; + int32 pdf_class = entry[j].forward_pdf_class; if (pdf_class != kNoPdf) { to_hmm_state_list[std::make_pair(phone, pdf_class)].push_back(j); } } } + for (int32 pdf = 0; pdf < static_cast(pdf_info.size()); pdf++) { for (size_t j = 0; j < pdf_info[pdf].size(); j++) { int32 phone = pdf_info[pdf][j].first, - pdf_class = pdf_info[pdf][j].second; + pdf_class = pdf_info[pdf][j].second; const std::vector &state_vec = to_hmm_state_list[std::make_pair(phone, pdf_class)]; KALDI_ASSERT(!state_vec.empty()); // state_vec is a list of the possible HMM-states that emit this // pdf_class.
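As an aside (editor's sketch, not part of the patch): the reverse lookup that TupleToTransitionState() performs later in this file relies on tuples_ being sorted, so that std::lower_bound can map a (phone, hmm-state, forward-pdf, self-loop-pdf) tuple back to its 1-based transition-state index. A minimal standalone version of that idiom, with hypothetical names:

#include <algorithm>
#include <cassert>
#include <vector>

struct Tuple {
  int phone, hmm_state, forward_pdf, self_loop_pdf;
  bool operator<(const Tuple &o) const {
    if (phone != o.phone) return phone < o.phone;
    if (hmm_state != o.hmm_state) return hmm_state < o.hmm_state;
    if (forward_pdf != o.forward_pdf) return forward_pdf < o.forward_pdf;
    return self_loop_pdf < o.self_loop_pdf;
  }
  bool operator==(const Tuple &o) const {
    return phone == o.phone && hmm_state == o.hmm_state &&
           forward_pdf == o.forward_pdf && self_loop_pdf == o.self_loop_pdf;
  }
};

// Returns the 1-based index ("transition-state") of 'target' in the sorted
// vector 'tuples', or 0 if it is not present.
int TupleToIndex(const std::vector<Tuple> &tuples, const Tuple &target) {
  std::vector<Tuple>::const_iterator it =
      std::lower_bound(tuples.begin(), tuples.end(), target);
  if (it == tuples.end() || !(*it == target)) return 0;
  return static_cast<int>(it - tuples.begin()) + 1;
}

int main() {
  std::vector<Tuple> tuples = { {1, 0, 3, 4}, {1, 0, 5, 5}, {2, 0, 7, 8} };
  std::sort(tuples.begin(), tuples.end());  // the sorting defines the indexing.
  Tuple query = {1, 0, 5, 5};
  assert(TupleToIndex(tuples, query) == 2);
  return 0;
}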
for (size_t k = 0; k < state_vec.size(); k++) { int32 hmm_state = state_vec[k]; - triples_.push_back(Triple(phone, hmm_state, pdf)); + tuples_.push_back(Tuple(phone, hmm_state, pdf, pdf)); } } } +} - // now triples_ is populated with all possible triples of (phone, hmm_state, pdf). - std::sort(triples_.begin(), triples_.end()); // sort to enable reverse lookup. - // this sorting defines the transition-ids. +void TransitionModel::ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep) { + const std::vector &phones = topo_.GetPhones(); + KALDI_ASSERT(!phones.empty()); + + // pdf_info is a set of lists indexed by phone. Each list is indexed by + // (pdf-class, self-loop pdf-class) of each state of that phone, and the element + // is a list of possible (pdf, self-loop pdf) pairs that that (pdf-class, self-loop pdf-class) + // pair generates. + std::vector > > > pdf_info; + // pdf_class_pairs is a set of lists indexed by phone. Each list stores + // (pdf-class, self-loop pdf-class) of each state of that phone. + std::vector > > pdf_class_pairs; + pdf_class_pairs.resize(1 + *std::max_element(phones.begin(), phones.end())); + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... + int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class; + if (forward_pdf_class != kNoPdf) + pdf_class_pairs[phone].push_back(std::make_pair(forward_pdf_class, self_loop_pdf_class)); + } + } + ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info); + + std::vector, std::vector > > to_hmm_state_list; + to_hmm_state_list.resize(1 + *std::max_element(phones.begin(), phones.end())); + // to_hmm_state_list is a phone-indexed set of maps from (pdf-class, self-loop pdf_class) to the list + // of hmm-states in the HMM for that phone that that (pdf-class, self-loop pdf-class) + // can correspond to. + for (size_t i = 0; i < phones.size(); i++) { // setting up to_hmm_state_list. + int32 phone = phones[i]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + std::map, std::vector > phone_to_hmm_state_list; + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... 
+ int32 forward_pdf_class = entry[j].forward_pdf_class, self_loop_pdf_class = entry[j].self_loop_pdf_class; + if (forward_pdf_class != kNoPdf) { + phone_to_hmm_state_list[std::make_pair(forward_pdf_class, self_loop_pdf_class)].push_back(j); + } + } + to_hmm_state_list[phone] = phone_to_hmm_state_list; + } + + for (int32 i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + for (int32 j = 0; j < static_cast(pdf_info[phone].size()); j++) { + int32 pdf_class = pdf_class_pairs[phone][j].first, + self_loop_pdf_class = pdf_class_pairs[phone][j].second; + const std::vector &state_vec = + to_hmm_state_list[phone][std::make_pair(pdf_class, self_loop_pdf_class)]; + KALDI_ASSERT(!state_vec.empty()); + for (size_t k = 0; k < state_vec.size(); k++) { + int32 hmm_state = state_vec[k]; + for (size_t m = 0; m < pdf_info[phone][j].size(); m++) { + int32 pdf = pdf_info[phone][j][m].first, + self_loop_pdf = pdf_info[phone][j][m].second; + tuples_.push_back(Tuple(phone, hmm_state, pdf, self_loop_pdf)); + } + } + } + } } void TransitionModel::ComputeDerived() { - state2id_.resize(triples_.size()+2); // indexed by transition-state, which + state2id_.resize(tuples_.size()+2); // indexed by transition-state, which // is one based, but also an entry for one past end of list. int32 cur_transition_id = 1; num_pdfs_ = 0; for (int32 tstate = 1; - tstate <= static_cast(triples_.size()+1); // not a typo. + tstate <= static_cast(tuples_.size()+1); // not a typo. tstate++) { state2id_[tstate] = cur_transition_id; - if (static_cast(tstate) <= triples_.size()) { - int32 phone = triples_[tstate-1].phone, - hmm_state = triples_[tstate-1].hmm_state, - pdf = triples_[tstate-1].pdf; - num_pdfs_ = std::max(num_pdfs_, 1+pdf); + if (static_cast(tstate) <= tuples_.size()) { + int32 phone = tuples_[tstate-1].phone, + hmm_state = tuples_[tstate-1].hmm_state, + forward_pdf = tuples_[tstate-1].forward_pdf, + self_loop_pdf = tuples_[tstate-1].self_loop_pdf; + num_pdfs_ = std::max(num_pdfs_, 1 + forward_pdf); + num_pdfs_ = std::max(num_pdfs_, 1 + self_loop_pdf); const HmmTopology::HmmState &state = topo_.TopologyForPhone(phone)[hmm_state]; int32 my_num_ids = static_cast(state.transitions.size()); cur_transition_id += my_num_ids; // # trans out of this state. @@ -91,20 +165,26 @@ void TransitionModel::ComputeDerived() { } id2state_.resize(cur_transition_id); // cur_transition_id is #transition-ids+1. - for (int32 tstate = 1; tstate <= static_cast(triples_.size()); tstate++) - for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) + id2pdf_id_.resize(cur_transition_id); + for (int32 tstate = 1; tstate <= static_cast(tuples_.size()); tstate++) + for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) { id2state_[tid] = tstate; - + if (IsSelfLoop(tid)) + id2pdf_id_[tid] = tuples_[tstate-1].self_loop_pdf; + else + id2pdf_id_[tid] = tuples_[tstate-1].forward_pdf; + } } + void TransitionModel::InitializeProbs() { log_probs_.Resize(NumTransitionIds()+1); // one-based array, zeroth element empty. 
for (int32 trans_id = 1; trans_id <= NumTransitionIds(); trans_id++) { int32 trans_state = id2state_[trans_id]; int32 trans_index = trans_id - state2id_[trans_state]; - const Triple &triple = triples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(triple.phone); - KALDI_ASSERT(static_cast(triple.hmm_state) < entry.size()); - BaseFloat prob = entry[triple.hmm_state].transitions[trans_index].second; + const Tuple &tuple = tuples_[trans_state-1]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); + KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); + BaseFloat prob = entry[tuple.hmm_state].transitions[trans_index].second; if (prob <= 0.0) KALDI_ERR << "TransitionModel::InitializeProbs, zero " "probability [should remove that entry in the topology]"; @@ -129,40 +209,55 @@ void TransitionModel::Check() const { KALDI_ASSERT(tid == PairToTransitionId(tstate, index)); int32 phone = TransitionStateToPhone(tstate), hmm_state = TransitionStateToHmmState(tstate), - pdf = TransitionStateToPdf(tstate); - KALDI_ASSERT(tstate == TripleToTransitionState(phone, hmm_state, pdf)); + forward_pdf = TransitionStateToForwardPdf(tstate), + self_loop_pdf = TransitionStateToSelfLoopPdf(tstate); + KALDI_ASSERT(tstate == TupleToTransitionState(phone, hmm_state, forward_pdf, self_loop_pdf)); KALDI_ASSERT(log_probs_(tid) <= 0.0 && log_probs_(tid) - log_probs_(tid) == 0.0); // checking finite and non-positive (and not out-of-bounds). } } +bool TransitionModel::IsHmm() const { + const std::vector &phones = topo_.GetPhones(); + KALDI_ASSERT(!phones.empty()); + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + for (int32 j = 0; j < static_cast(entry.size()); j++) { // for each state... + if (entry[j].forward_pdf_class != entry[j].self_loop_pdf_class) + return false; + } + } + return true; +} + TransitionModel::TransitionModel(const ContextDependencyInterface &ctx_dep, const HmmTopology &hmm_topo): topo_(hmm_topo) { - // First thing is to get all possible triples. - ComputeTriples(ctx_dep); + // First thing is to get all possible tuples. + ComputeTuples(ctx_dep); ComputeDerived(); InitializeProbs(); Check(); } -int32 TransitionModel::TripleToTransitionState(int32 phone, int32 hmm_state, int32 pdf) const { - Triple triple(phone, hmm_state, pdf); +int32 TransitionModel::TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const { + Tuple tuple(phone, hmm_state, pdf, self_loop_pdf); // Note: if this ever gets too expensive, which is unlikely, we can refactor // this code to sort first on pdf, and then index on pdf, so those // that have the same pdf are in a contiguous range. - std::vector::const_iterator iter = - std::lower_bound(triples_.begin(), triples_.end(), triple); - if (iter == triples_.end() || !(*iter == triple)) { - KALDI_ERR << "TransitionModel::TripleToTransitionState, triple not found." + std::vector::const_iterator iter = + std::lower_bound(tuples_.begin(), tuples_.end(), tuple); + if (iter == tuples_.end() || !(*iter == tuple)) { + KALDI_ERR << "TransitionModel::TupleToTransitionState, tuple not found." << " (incompatible tree and model?)"; } - // triples_ is indexed by transition_state-1, so add one. - return static_cast((iter - triples_.begin())) + 1; + // tuples_ is indexed by transition_state-1, so add one. 
+ return static_cast((iter - tuples_.begin())) + 1; } int32 TransitionModel::NumTransitionIndices(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); return static_cast(state2id_[trans_state+1]-state2id_[trans_state]); } @@ -177,32 +272,57 @@ int32 TransitionModel::TransitionIdToTransitionIndex(int32 trans_id) const { } int32 TransitionModel::TransitionStateToPhone(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); - return triples_[trans_state-1].phone; + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].phone; } -int32 TransitionModel::TransitionStateToPdf(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); - return triples_[trans_state-1].pdf; +int32 TransitionModel::TransitionStateToForwardPdf(int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].forward_pdf; +} + +int32 TransitionModel::TransitionStateToForwardPdfClass( + int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + const Tuple &t = tuples_[trans_state-1]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); + KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); + return entry[t.hmm_state].forward_pdf_class; +} + + +int32 TransitionModel::TransitionStateToSelfLoopPdfClass( + int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + const Tuple &t = tuples_[trans_state-1]; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); + KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); + return entry[t.hmm_state].self_loop_pdf_class; +} + + +int32 TransitionModel::TransitionStateToSelfLoopPdf(int32 trans_state) const { + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].self_loop_pdf; } int32 TransitionModel::TransitionStateToHmmState(int32 trans_state) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); - return triples_[trans_state-1].hmm_state; + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); + return tuples_[trans_state-1].hmm_state; } int32 TransitionModel::PairToTransitionId(int32 trans_state, int32 trans_index) const { - KALDI_ASSERT(static_cast(trans_state) <= triples_.size()); + KALDI_ASSERT(static_cast(trans_state) <= tuples_.size()); KALDI_ASSERT(trans_index < state2id_[trans_state+1] - state2id_[trans_state]); return state2id_[trans_state] + trans_index; } int32 TransitionModel::NumPhones() const { - int32 num_trans_state = triples_.size(); + int32 num_trans_state = tuples_.size(); int32 max_phone_id = 0; for (int32 i = 0; i < num_trans_state; ++i) { - if (triples_[i].phone > max_phone_id) - max_phone_id = triples_[i].phone; + if (tuples_[i].phone > max_phone_id) + max_phone_id = tuples_[i].phone; } return max_phone_id; } @@ -212,36 +332,25 @@ bool TransitionModel::IsFinal(int32 trans_id) const { KALDI_ASSERT(static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; int32 trans_index = trans_id - state2id_[trans_state]; - const Triple &triple = triples_[trans_state-1]; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(triple.phone); - KALDI_ASSERT(static_cast(triple.hmm_state) < entry.size()); - KALDI_ASSERT(static_cast(triple.hmm_state) < entry.size()); + const Tuple &tuple = tuples_[trans_state-1]; + const 
HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); + KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); + KALDI_ASSERT(static_cast(tuple.hmm_state) < entry.size()); KALDI_ASSERT(static_cast(trans_index) < - entry[triple.hmm_state].transitions.size()); + entry[tuple.hmm_state].transitions.size()); // return true if the transition goes to the final state of the // topology entry. - return (entry[triple.hmm_state].transitions[trans_index].first + 1 == + return (entry[tuple.hmm_state].transitions[trans_index].first + 1 == static_cast(entry.size())); } -bool TransitionModel::IsSelfLoop(int32 trans_id) const { - KALDI_ASSERT(static_cast(trans_id) < id2state_.size()); - int32 trans_state = id2state_[trans_id]; - int32 trans_index = trans_id - state2id_[trans_state]; - const Triple &triple = triples_[trans_state-1]; - int32 phone = triple.phone, hmm_state = triple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); - KALDI_ASSERT(static_cast(hmm_state) < entry.size()); - return (static_cast(trans_index) < entry[hmm_state].transitions.size() - && entry[hmm_state].transitions[trans_index].first == hmm_state); -} int32 TransitionModel::SelfLoopOf(int32 trans_state) const { // returns the self-loop transition-id, - KALDI_ASSERT(static_cast(trans_state-1) < triples_.size()); - const Triple &triple = triples_[trans_state-1]; + KALDI_ASSERT(static_cast(trans_state-1) < tuples_.size()); + const Tuple &tuple = tuples_[trans_state-1]; // or zero if does not exist. - int32 phone = triple.phone, hmm_state = triple.hmm_state; + int32 phone = tuple.phone, hmm_state = tuple.hmm_state; const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); KALDI_ASSERT(static_cast(hmm_state) < entry.size()); for (int32 trans_index = 0; @@ -274,16 +383,22 @@ void TransitionModel::ComputeDerivedOfProbs() { void TransitionModel::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); topo_.Read(is, binary); - ExpectToken(is, binary, ""); + std::string token; + ReadToken(is, binary, &token); int32 size; ReadBasicType(is, binary, &size); - triples_.resize(size); + tuples_.resize(size); for (int32 i = 0; i < size; i++) { - ReadBasicType(is, binary, &(triples_[i].phone)); - ReadBasicType(is, binary, &(triples_[i].hmm_state)); - ReadBasicType(is, binary, &(triples_[i].pdf)); + ReadBasicType(is, binary, &(tuples_[i].phone)); + ReadBasicType(is, binary, &(tuples_[i].hmm_state)); + ReadBasicType(is, binary, &(tuples_[i].forward_pdf)); + if (token == "") + ReadBasicType(is, binary, &(tuples_[i].self_loop_pdf)); + else if (token == "") + tuples_[i].self_loop_pdf = tuples_[i].forward_pdf; } - ExpectToken(is, binary, ""); + ReadToken(is, binary, &token); + KALDI_ASSERT(token == "" || token == ""); ComputeDerived(); ExpectToken(is, binary, ""); log_probs_.Read(is, binary); @@ -294,19 +409,28 @@ void TransitionModel::Read(std::istream &is, bool binary) { } void TransitionModel::Write(std::ostream &os, bool binary) const { + bool is_hmm = IsHmm(); WriteToken(os, binary, ""); if (!binary) os << "\n"; topo_.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(triples_.size())); + if (is_hmm) + WriteToken(os, binary, ""); + else + WriteToken(os, binary, ""); + WriteBasicType(os, binary, static_cast(tuples_.size())); if (!binary) os << "\n"; - for (int32 i = 0; i < static_cast (triples_.size()); i++) { - WriteBasicType(os, binary, triples_[i].phone); - WriteBasicType(os, binary, triples_[i].hmm_state); - WriteBasicType(os, 
binary, triples_[i].pdf); + for (int32 i = 0; i < static_cast (tuples_.size()); i++) { + WriteBasicType(os, binary, tuples_[i].phone); + WriteBasicType(os, binary, tuples_[i].hmm_state); + WriteBasicType(os, binary, tuples_[i].forward_pdf); + if (!is_hmm) + WriteBasicType(os, binary, tuples_[i].self_loop_pdf); if (!binary) os << "\n"; } - WriteToken(os, binary, ""); + if (is_hmm) + WriteToken(os, binary, ""); + else + WriteToken(os, binary, ""); if (!binary) os << "\n"; WriteToken(os, binary, ""); if (!binary) os << "\n"; @@ -473,8 +597,12 @@ void TransitionModel::MleUpdateShared(const Vector &stats, std::map > pdf_to_tstate; for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 pdf = TransitionStateToPdf(tstate); + int32 pdf = TransitionStateToForwardPdf(tstate); pdf_to_tstate[pdf].insert(tstate); + if (!IsHmm()) { + pdf = TransitionStateToSelfLoopPdf(tstate); + pdf_to_tstate[pdf].insert(tstate); + } } std::map >::iterator map_iter; for (map_iter = pdf_to_tstate.begin(); @@ -567,8 +695,12 @@ void TransitionModel::MapUpdateShared(const Vector &stats, std::map > pdf_to_tstate; for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - int32 pdf = TransitionStateToPdf(tstate); + int32 pdf = TransitionStateToForwardPdf(tstate); pdf_to_tstate[pdf].insert(tstate); + if (!IsHmm()) { + pdf = TransitionStateToSelfLoopPdf(tstate); + pdf_to_tstate[pdf].insert(tstate); + } } std::map >::iterator map_iter; for (map_iter = pdf_to_tstate.begin(); @@ -642,24 +774,27 @@ void TransitionModel::MapUpdateShared(const Vector &stats, int32 TransitionModel::TransitionIdToPhone(int32 trans_id) const { KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; - return triples_[trans_state-1].phone; + return tuples_[trans_state-1].phone; } int32 TransitionModel::TransitionIdToPdfClass(int32 trans_id) const { KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; - const Triple &t = triples_[trans_state-1]; + const Tuple &t = tuples_[trans_state-1]; const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(t.phone); KALDI_ASSERT(static_cast(t.hmm_state) < entry.size()); - return entry[t.hmm_state].pdf_class; + if (IsSelfLoop(trans_id)) + return entry[t.hmm_state].self_loop_pdf_class; + else + return entry[t.hmm_state].forward_pdf_class; } int32 TransitionModel::TransitionIdToHmmState(int32 trans_id) const { KALDI_ASSERT(trans_id != 0 && static_cast(trans_id) < id2state_.size()); int32 trans_state = id2state_[trans_id]; - const Triple &t = triples_[trans_state-1]; + const Tuple &t = tuples_[trans_state-1]; return t.hmm_state; } @@ -668,23 +803,34 @@ void TransitionModel::Print(std::ostream &os, const Vector *occs) { if (occs != NULL) KALDI_ASSERT(occs->Dim() == NumPdfs()); + bool is_hmm = IsHmm(); for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) { - const Triple &triple = triples_[tstate-1]; - KALDI_ASSERT(static_cast(triple.phone) < phone_names.size()); - std::string phone_name = phone_names[triple.phone]; + const Tuple &tuple = tuples_[tstate-1]; + KALDI_ASSERT(static_cast(tuple.phone) < phone_names.size()); + std::string phone_name = phone_names[tuple.phone]; os << "Transition-state " << tstate << ": phone = " << phone_name - << " hmm-state = " << triple.hmm_state << " pdf = " << triple.pdf << '\n'; + << " hmm-state = " << tuple.hmm_state; + if (is_hmm) + os << " pdf = " << tuple.forward_pdf << '\n'; + else + os << " forward-pdf = " << 
tuple.forward_pdf << " self-loop-pdf = " + << tuple.self_loop_pdf << '\n'; for (int32 tidx = 0; tidx < NumTransitionIndices(tstate); tidx++) { int32 tid = PairToTransitionId(tstate, tidx); BaseFloat p = GetTransitionProb(tid); os << " Transition-id = " << tid << " p = " << p; - if (occs != NULL) os << " count of pdf = " << (*occs)(triple.pdf); + if (occs != NULL) { + if (IsSelfLoop(tid)) + os << " count of pdf = " << (*occs)(tuple.self_loop_pdf); + else + os << " count of pdf = " << (*occs)(tuple.forward_pdf); + } // now describe what it's a transition to. if (IsSelfLoop(tid)) os << " [self-loop]\n"; else { - int32 hmm_state = triple.hmm_state; - const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(triple.phone); + int32 hmm_state = tuple.hmm_state; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(tuple.phone); KALDI_ASSERT(static_cast(hmm_state) < entry.size()); int32 next_hmm_state = entry[hmm_state].transitions[tidx].first; KALDI_ASSERT(next_hmm_state != hmm_state); @@ -702,14 +848,18 @@ bool GetPdfsForPhones(const TransitionModel &trans_model, pdfs->clear(); for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) { if (std::binary_search(phones.begin(), phones.end(), - trans_model.TransitionStateToPhone(tstate))) - pdfs->push_back(trans_model.TransitionStateToPdf(tstate)); + trans_model.TransitionStateToPhone(tstate))) { + pdfs->push_back(trans_model.TransitionStateToForwardPdf(tstate)); + pdfs->push_back(trans_model.TransitionStateToSelfLoopPdf(tstate)); + } } SortAndUniq(pdfs); for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) - if (std::binary_search(pdfs->begin(), pdfs->end(), - trans_model.TransitionStateToPdf(tstate)) + if ((std::binary_search(pdfs->begin(), pdfs->end(), + trans_model.TransitionStateToForwardPdf(tstate)) || + std::binary_search(pdfs->begin(), pdfs->end(), + trans_model.TransitionStateToSelfLoopPdf(tstate))) && !std::binary_search(phones.begin(), phones.end(), trans_model.TransitionStateToPhone(tstate))) return false; @@ -724,7 +874,9 @@ bool GetPhonesForPdfs(const TransitionModel &trans_model, phones->clear(); for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) { if (std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToPdf(tstate))) + trans_model.TransitionStateToForwardPdf(tstate)) || + std::binary_search(pdfs.begin(), pdfs.end(), + trans_model.TransitionStateToSelfLoopPdf(tstate))) phones->push_back(trans_model.TransitionStateToPhone(tstate)); } SortAndUniq(phones); @@ -732,16 +884,30 @@ bool GetPhonesForPdfs(const TransitionModel &trans_model, for (int32 tstate = 1; tstate <= trans_model.NumTransitionStates(); tstate++) if (std::binary_search(phones->begin(), phones->end(), trans_model.TransitionStateToPhone(tstate)) - && !std::binary_search(pdfs.begin(), pdfs.end(), - trans_model.TransitionStateToPdf(tstate))) + && !(std::binary_search(pdfs.begin(), pdfs.end(), + trans_model.TransitionStateToForwardPdf(tstate)) && + std::binary_search(pdfs.begin(), pdfs.end(), + trans_model.TransitionStateToSelfLoopPdf(tstate))) ) return false; return true; } bool TransitionModel::Compatible(const TransitionModel &other) const { - return (topo_ == other.topo_ && triples_ == other.triples_ && + return (topo_ == other.topo_ && tuples_ == other.tuples_ && state2id_ == other.state2id_ && id2state_ == other.id2state_ && num_pdfs_ == other.num_pdfs_); } +bool TransitionModel::IsSelfLoop(int32 trans_id) const { + KALDI_ASSERT(static_cast(trans_id) < 
id2state_.size()); + int32 trans_state = id2state_[trans_id]; + int32 trans_index = trans_id - state2id_[trans_state]; + const Tuple &tuple = tuples_[trans_state-1]; + int32 phone = tuple.phone, hmm_state = tuple.hmm_state; + const HmmTopology::TopologyEntry &entry = topo_.TopologyForPhone(phone); + KALDI_ASSERT(static_cast(hmm_state) < entry.size()); + return (static_cast(trans_index) < entry[hmm_state].transitions.size() + && entry[hmm_state].transitions[trans_index].first == hmm_state); +} + } // End namespace kaldi diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index ff236e6de9e..33a0d55443e 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -53,7 +53,7 @@ namespace kaldi { // phone: a phone index (1, 2, 3 ...) // HMM-state: a number (0, 1, 2...) that indexes TopologyEntry (see hmm-topology.h) // pdf-id: a number output by the Compute function of ContextDependency (it -// indexes pdf's). Zero-based. +// indexes pdf's, either forward or self-loop). Zero-based. // transition-state: the states for which we estimate transition probabilities for transitions // out of them. In some topologies, will map one-to-one with pdf-ids. // One-based, since it appears on FSTs. @@ -66,14 +66,15 @@ namespace kaldi { // One-based, since it appears on FSTs. // // List of the possible mappings TransitionModel can do: -// (phone, HMM-state, pdf-id) -> transition-state -// (transition-state, transition-index) -> transition-id +// (phone, HMM-state, forward-pdf-id, self-loop-pdf-id) -> transition-state +// (transition-state, transition-index) -> transition-id // Reverse mappings: // transition-id -> transition-state // transition-id -> transition-index // transition-state -> phone // transition-state -> HMM-state -// transition-state -> pdf-id +// transition-state -> forward-pdf-id +// transition-state -> self-loop-pdf-id // // The main things the TransitionModel object can do are: // Get initialized (need ContextDependency and HmmTopology objects). @@ -141,13 +142,16 @@ class TransitionModel { /// \name Integer mapping functions /// @{ - int32 TripleToTransitionState(int32 phone, int32 hmm_state, int32 pdf) const; + int32 TupleToTransitionState(int32 phone, int32 hmm_state, int32 pdf, int32 self_loop_pdf) const; int32 PairToTransitionId(int32 trans_state, int32 trans_index) const; int32 TransitionIdToTransitionState(int32 trans_id) const; int32 TransitionIdToTransitionIndex(int32 trans_id) const; int32 TransitionStateToPhone(int32 trans_state) const; int32 TransitionStateToHmmState(int32 trans_state) const; - int32 TransitionStateToPdf(int32 trans_state) const; + int32 TransitionStateToForwardPdfClass(int32 trans_state) const; + int32 TransitionStateToSelfLoopPdfClass(int32 trans_state) const; + int32 TransitionStateToForwardPdf(int32 trans_state) const; + int32 TransitionStateToSelfLoopPdf(int32 trans_state) const; int32 SelfLoopOf(int32 trans_state) const; // returns the self-loop transition-id, or zero if // this state doesn't have a self-loop. @@ -172,7 +176,7 @@ class TransitionModel { int32 NumTransitionIndices(int32 trans_state) const; /// Returns the total number of transition-states (note, these are one-based). - int32 NumTransitionStates() const { return triples_.size(); } + int32 NumTransitionStates() const { return tuples_.size(); } // NumPdfs() actually returns the highest-numbered pdf we ever saw, plus one. 
// In normal cases this should equal the number of pdfs in the system, but if you @@ -249,30 +253,36 @@ class TransitionModel { void MapUpdateShared(const Vector &stats, const MapTransitionUpdateConfig &cfg, BaseFloat *objf_impr_out, BaseFloat *count_out); - void ComputeTriples(const ContextDependencyInterface &ctx_dep); // called from constructor. initializes triples_. + void ComputeTuples(const ContextDependencyInterface &ctx_dep); // called from constructor. initializes tuples_. + void ComputeTuplesIsHmm(const ContextDependencyInterface &ctx_dep); + void ComputeTuplesNotHmm(const ContextDependencyInterface &ctx_dep); void ComputeDerived(); // called from constructor and Read function: computes state2id_ and id2state_. void ComputeDerivedOfProbs(); // computes quantities derived from log-probs (currently just // non_self_loop_log_probs_; called whenever log-probs change. void InitializeProbs(); // called from constructor. void Check() const; + bool IsHmm() const; - struct Triple { + struct Tuple { int32 phone; int32 hmm_state; - int32 pdf; - Triple() { } - Triple(int32 phone, int32 hmm_state, int32 pdf): - phone(phone), hmm_state(hmm_state), pdf(pdf) { } - bool operator < (const Triple &other) const { + int32 forward_pdf; + int32 self_loop_pdf; + Tuple() { } + Tuple(int32 phone, int32 hmm_state, int32 forward_pdf, int32 self_loop_pdf): + phone(phone), hmm_state(hmm_state), forward_pdf(forward_pdf), self_loop_pdf(self_loop_pdf) { } + bool operator < (const Tuple &other) const { if (phone < other.phone) return true; else if (phone > other.phone) return false; else if (hmm_state < other.hmm_state) return true; else if (hmm_state > other.hmm_state) return false; - else return pdf < other.pdf; + else if (forward_pdf < other.forward_pdf) return true; + else if (forward_pdf > other.forward_pdf) return false; + else return (self_loop_pdf < other.self_loop_pdf); } - bool operator == (const Triple &other) const { + bool operator == (const Tuple &other) const { return (phone == other.phone && hmm_state == other.hmm_state - && pdf == other.pdf); + && forward_pdf == other.forward_pdf && self_loop_pdf == other.self_loop_pdf); } }; @@ -281,7 +291,7 @@ class TransitionModel { /// Triples indexed by transition state minus one; /// the triples are in sorted order which allows us to do the reverse mapping from /// triple to transition state - std::vector triples_; + std::vector tuples_; /// Gives the first transition_id of each transition-state; indexed by /// the transition-state. Array indexed 1..num-transition-states+1 (the last one @@ -292,6 +302,8 @@ class TransitionModel { /// state (indexed by transition-id). std::vector id2state_; + std::vector id2pdf_id_; + /// For each transition-id, the corresponding log-prob. Indexed by transition-id. Vector log_probs_; @@ -310,12 +322,9 @@ class TransitionModel { }; inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { - // If a lot of time is spent here we may create an extra array - // to handle this. - KALDI_ASSERT(static_cast(trans_id) < id2state_.size() && + KALDI_ASSERT(static_cast(trans_id) < id2pdf_id_.size() && "Likely graph/model mismatch (graph built from wrong model?)"); - int32 trans_state = id2state_[trans_id]; - return triples_[trans_state-1].pdf; + return id2pdf_id_[trans_id]; } /// Works out which pdfs might correspond to the given phones. 
Will return true diff --git a/src/itf/context-dep-itf.h b/src/itf/context-dep-itf.h index b989dd900ea..40681bb5ccd 100644 --- a/src/itf/context-dep-itf.h +++ b/src/itf/context-dep-itf.h @@ -63,9 +63,36 @@ class ContextDependencyInterface { /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which /// pairs of (phone, pdf-class) it can correspond to. (Usually just one). /// c.f. hmm/hmm-topology.h for meaning of pdf-class. - virtual void GetPdfInfo(const std::vector &phones, // list of phones - const std::vector &num_pdf_classes, // indexed by phone, - std::vector > > *pdf_info) + /// This is the old, simpler interface of GetPdfInfo(), and that this one can + /// only be called if the HmmTopology object's IsHmm() function call returns + /// true. + virtual void GetPdfInfo( + const std::vector &phones, // list of phones + const std::vector &num_pdf_classes, // indexed by phone, + std::vector > > *pdf_info) + const = 0; + + /// This function outputs information about what possible pdf-ids can + /// be generated for HMM-states; it covers the general case where + /// the self-loop pdf-class may be different from the forward-transition + /// pdf-class, so we are asking not about the set of possible pdf-ids + /// for a given (phone, pdf-class), but the set of possible ordered pairs + /// (forward-transition-pdf, self-loop-pdf) for a given (phone, + /// forward-transition-pdf-class, self-loop-pdf-class). + /// Note: 'phones' is a list of integer ids of phones, and + /// 'pdf-class-pairs', indexed by phone, is a list of pairs + /// (forward-transition-pdf-class, self-loop-pdf-class) that we can have for + /// that phone. + /// The output 'pdf_info' is indexed first by phone and then by the + /// same index that indexes each element of 'pdf_class_pairs', + /// and tells us for each pair in 'pdf_class_pairs', what is the + /// list of possible (forward-transition-pdf-id, self-loop-pdf-id) that + /// we can have. + /// This is less efficient than the other version of GetPdfInfo(). + virtual void GetPdfInfo( + const std::vector &phones, + const std::vector > > &pdf_class_pairs, + std::vector > > > *pdf_info) const = 0; diff --git a/src/lm/arpa-file-parser-test.cc b/src/lm/arpa-file-parser-test.cc index 7f1edd7b44b..5b5421873c4 100644 --- a/src/lm/arpa-file-parser-test.cc +++ b/src/lm/arpa-file-parser-test.cc @@ -219,13 +219,13 @@ ngram 1=4\n\ ngram 2=2\n\ ngram 3=2\n\ \n\ -\\1-grams:\n\ +\\1-grams: \n\ -5.2 a -3.3\n\ -3.4 \xCE\xB2\n\ 0.0 -2.5\n\ -4.3 \n\ \n\ -\\2-grams:\n\ +\\2-grams:\t\n\ -1.5 a \xCE\xB2 -3.2\n\ -1.3 a -4.2\n\ \n\ diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc index 49b425adca4..d3307a477c0 100644 --- a/src/lm/arpa-file-parser.cc +++ b/src/lm/arpa-file-parser.cc @@ -18,10 +18,10 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
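// To make the indexing of the pair-based GetPdfInfo() declared above concrete, here is a
// minimal usage sketch (not part of the patch; it assumes the usual Kaldi integer/pair
// types for the container arguments and a ContextDependency object built elsewhere):
#include <utility>
#include <vector>
#include "tree/context-dep.h"

void ExamplePairPdfInfo(const kaldi::ContextDependency &ctx_dep) {
  using kaldi::int32;
  std::vector<int32> phones;
  phones.push_back(1);
  phones.push_back(2);
  // pdf_class_pairs is indexed by phone; each entry lists the
  // (forward-transition-pdf-class, self-loop-pdf-class) pairs used by that phone.
  std::vector<std::vector<std::pair<int32, int32> > > pdf_class_pairs(3);
  pdf_class_pairs[1].push_back(std::make_pair(0, 0));
  pdf_class_pairs[2].push_back(std::make_pair(0, 1));
  std::vector<std::vector<std::vector<std::pair<int32, int32> > > > pdf_info;
  ctx_dep.GetPdfInfo(phones, pdf_class_pairs, &pdf_info);
  // pdf_info[phone][j] now lists the possible
  // (forward-transition-pdf-id, self-loop-pdf-id) pairs for pdf_class_pairs[phone][j].
}
// The older single-pdf-class overload remains available for topologies where IsHmm()
// returns true, i.e. where the forward and self-loop transitions share one pdf-class.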
-#include - #include + #include "base/kaldi-error.h" #include "base/kaldi-math.h" #include "lm/arpa-file-parser.h" @@ -38,6 +38,10 @@ ArpaFileParser::ArpaFileParser(ArpaParseOptions options, ArpaFileParser::~ArpaFileParser() { } +void TrimTrailingWhitespace(std::string *str) { + str->erase(str->find_last_not_of(" \n\r\t") + 1); +} + void ArpaFileParser::Read(std::istream &is, bool binary) { if (binary) { KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; @@ -84,6 +88,8 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { while (++line_number_, getline(is, current_line_) && !is.eof()) { if (current_line_.empty()) continue; + TrimTrailingWhitespace(&current_line_); + // Continue skipping lines until the \data\ marker alone on a line is found. if (!keyword_found) { if (current_line_ == "\\data\\") { @@ -147,7 +153,28 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { int32 ngram_count = 0; while (++line_number_, getline(is, current_line_) && !is.eof()) { if (current_line_.empty()) continue; - if (current_line_[0] == '\\') break; + if (current_line_[0] == '\\') { + TrimTrailingWhitespace(&current_line_); + std::ostringstream next_keyword; + next_keyword << "\\" << cur_order + 1 << "-grams:"; + if ((current_line_ != next_keyword.str()) && + (current_line_ != "\\end\\")) { + if (ShouldWarn()) { + KALDI_WARN << "ignoring possible directive '" << current_line_ + << "' expecting '" << next_keyword.str() << "'"; + + if (warning_count_ > 0 && + warning_count_ > static_cast<uint32>(options_.max_warnings)) { + KALDI_WARN << "Of " << warning_count_ << " parse warnings, " + << options_.max_warnings << " were reported. " + << "Run program with --max_warnings=-1 " + << "to see all warnings"; + } + } + } else { + break; + } + } std::vector<std::string> col; SplitStringToVector(current_line_, " \t", true, &col); @@ -183,7 +210,7 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { } else { word = symbols_->Find(col[1 + index]); if (word == fst::SymbolTable::kNoSymbol) { - switch(options_.oov_handling) { + switch (options_.oov_handling) { case ArpaParseOptions::kReplaceWithUnk: word = options_.unk_symbol; break; @@ -227,7 +254,8 @@ void ArpaFileParser::Read(std::istream &is, bool binary) { PARSE_ERR << "invalid or unexpected directive line, expecting \\end\\"; } - if (warning_count_ > 0 && warning_count_ > (uint32)options_.max_warnings) { + if (warning_count_ > 0 && + warning_count_ > static_cast<uint32>(options_.max_warnings)) { KALDI_WARN << "Of " << warning_count_ << " parse warnings, " << options_.max_warnings << " were reported. Run program with " << "--max_warnings=-1 to see all warnings"; @@ -246,7 +274,7 @@ std::string ArpaFileParser::LineReference() const { } bool ArpaFileParser::ShouldWarn() { - return ++warning_count_ <= (uint32)options_.max_warnings; + return ++warning_count_ <= static_cast<uint32>(options_.max_warnings); } } // namespace kaldi diff --git a/src/lm/arpa-lm-compiler.cc b/src/lm/arpa-lm-compiler.cc index 14378aa374c..634a6267c4e 100644 --- a/src/lm/arpa-lm-compiler.cc +++ b/src/lm/arpa-lm-compiler.cc @@ -318,21 +318,35 @@ void ArpaLmCompiler::ConsumeNGram(const NGram &ngram) { void ArpaLmCompiler::RemoveRedundantStates() { fst::StdArc::Label backoff_symbol = sub_eps_; + if (backoff_symbol == 0) { + // The method of removing redundant states implemented in this function + // leads to slow determinization of L o G when people use the older style of + // usage of arpa2fst where the --disambig-symbol option was not specified.
+ // The issue seems to be that it creates a non-deterministic FST, while G is + // supposed to be deterministic. By 'return'ing below, we just disable this + // method if people were using an older script. This method isn't really + // that consequential anyway, and people will move to the newer-style + // scripts (see current utils/format_lm.sh), so this isn't much of a + // problem. + return; + } + fst::StdArc::StateId num_states = fst_.NumStates(); + + // replace the #0 symbols on the input of arcs out of redundant states (states // that are not final and have only a backoff arc leaving them), with <eps>. - if (backoff_symbol != 0) { - for (fst::StdArc::StateId state = 0; state < num_states; state++) { - if (fst_.NumArcs(state) == 1 && fst_.Final(state) == fst::TropicalWeight::Zero()) { - fst::MutableArcIterator<fst::StdVectorFst> iter(&fst_, state); - fst::StdArc arc = iter.Value(); - if (arc.ilabel == backoff_symbol) { - arc.ilabel = 0; - iter.SetValue(arc); - } + for (fst::StdArc::StateId state = 0; state < num_states; state++) { + if (fst_.NumArcs(state) == 1 && fst_.Final(state) == fst::TropicalWeight::Zero()) { + fst::MutableArcIterator<fst::StdVectorFst> iter(&fst_, state); + fst::StdArc arc = iter.Value(); + if (arc.ilabel == backoff_symbol) { + arc.ilabel = 0; + iter.SetValue(arc); } } } + // we could call fst::RemoveEps, and it would have the same effect in normal // cases, where backoff_symbol != 0 and there are no epsilons in unexpected // places, but RemoveEpsLocal is a bit safer in case something weird is going diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc index 9608a5475e0..f807529159e 100644 --- a/src/nnet2/nnet-component.cc +++ b/src/nnet2/nnet-component.cc @@ -572,14 +572,7 @@ void NormalizeComponent::Propagate(const ChunkInfo &in_info, const ChunkInfo &out_info, const CuMatrixBase<BaseFloat> &in, CuMatrixBase<BaseFloat> *out) const { - out->CopyFromMat(in); - - CuVector<BaseFloat> in_norm(in.NumRows()); - in_norm.AddDiagMat2(1.0 / in.NumCols(), - in, kNoTrans, 0.0); - in_norm.ApplyFloor(kNormFloor); - in_norm.ApplyPow(-0.5); - out->MulRowsVec(in_norm); + cu::NormalizePerRow(in, BaseFloat(1), false, out); } /* diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index d58a58e6f2b..cee9e8f9bd7 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -156,13 +156,18 @@ void Compiler::ComputeDerivNeeded( if (request_.outputs[output_index].has_deriv) (*deriv_needed)[step] = true; } - // If this is an updatable Component node and the user requested model - // derivatives (e.g. during training), we need this step's derivative. + // If this is an updatable Component node with a nonzero learning rate and + // the user requested model derivatives (e.g. during training), we need this + // step's derivative.
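// The check this comment describes amounts to the following predicate (a sketch in terms
// of the standard nnet3 Component/UpdatableComponent API, not the compiler's literal code):
#include "nnet3/nnet-component-itf.h"

namespace kaldi {
namespace nnet3 {
// A step needs a model derivative only if its component is updatable and its
// learning rate has not been set to zero (e.g. to freeze a layer).
static bool ComponentNeedsModelDerivative(const Component *c) {
  if ((c->Properties() & kUpdatableComponent) == 0)
    return false;
  const UpdatableComponent *u = dynamic_cast<const UpdatableComponent*>(c);
  KALDI_ASSERT(u != NULL);
  return u->LearningRate() != 0.0;
}
}  // namespace nnet3
}  // namespace kaldi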
if (nnet_.IsComponentNode(node_index) && request_.need_model_derivative) { const NetworkNode &node = nnet_.GetNode(node_index); const Component *c = nnet_.GetComponent(node.u.component_index); - if (c->Properties() & kUpdatableComponent) - (*deriv_needed)[step] = true; + if (c->Properties() & kUpdatableComponent) { + const UpdatableComponent *u = dynamic_cast(c); + KALDI_ASSERT(u != NULL); + if (u->LearningRate() != 0) + (*deriv_needed)[step] = true; + } } } if (GetVerboseLevel() >= 5) { diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 5c38d125c98..b40670407c8 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -55,6 +55,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new StatisticsExtractionComponentPrecomputedIndexes(); } else if (cpi_type == "StatisticsPoolingComponentPrecomputedIndexes") { ans = new StatisticsPoolingComponentPrecomputedIndexes(); + } else if (cpi_type == "BackpropTruncationComponentPrecomputedIndexes") { + ans = new BackpropTruncationComponentPrecomputedIndexes(); } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); @@ -143,6 +145,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new ConstantFunctionComponent(); } else if (component_type == "DropoutComponent") { ans = new DropoutComponent(); + } else if (component_type == "BackpropTruncationComponent") { + ans = new BackpropTruncationComponent(); } if (ans != NULL) { KALDI_ASSERT(component_type == ans->Type()); diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 80793bf1d98..f5687ec1d71 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -880,6 +880,291 @@ void StatisticsPoolingComponent::Backprop( indexes->backward_indexes); } +// virtual +void BackpropTruncationComponent::Read(std::istream &is, bool binary) { + // might not see the "" part because + // of how ReadNew() works. 
+ ExpectOneOrTwoTokens(is, binary, "", + ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &clipping_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &zeroing_threshold_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &zeroing_interval_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &recurrence_interval_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_clipped_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_zeroed_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_zeroing_boundaries_); + ExpectToken(is, binary, ""); +} + +// virtual +void BackpropTruncationComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, clipping_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, zeroing_threshold_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, zeroing_interval_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, recurrence_interval_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_clipped_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_zeroed_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_zeroing_boundaries_); + WriteToken(os, binary, ""); +} + +void BackpropTruncationComponentPrecomputedIndexes::Write(std::ostream &ostream, + bool binary) const { + WriteToken(ostream, binary, + ""); + WriteToken(ostream, binary, ""); + zeroing.Write(ostream, binary); + WriteToken(ostream, binary, ""); + WriteBasicType(ostream, binary, zeroing_sum); + WriteToken(ostream, binary, + ""); +} + +void BackpropTruncationComponentPrecomputedIndexes::Read(std::istream &istream, + bool binary) { + ExpectOneOrTwoTokens(istream, binary, + "", + ""); + zeroing.Read(istream, binary); + ExpectToken(istream, binary, ""); + ReadBasicType(istream, binary, &zeroing_sum); + ExpectToken(istream, binary, + ""); +} + +std::string BackpropTruncationComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ + << ", clipping-threshold=" << clipping_threshold_ + << ", clipped-proportion=" + << (count_ > 0.0 ? num_clipped_ / count_ : 0) + << ", zeroing-threshold=" << zeroing_threshold_ + << ", zeroed-proportion=" + << (count_zeroing_boundaries_ > 0.0 ? 
+ num_zeroed_ / count_zeroing_boundaries_ : 0) + << ", count-zeroing-boundaries=" + << static_cast(count_zeroing_boundaries_); + return stream.str(); +} + +void BackpropTruncationComponent::Init(int32 dim, + BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, + int32 zeroing_interval, + int32 recurrence_interval) { + KALDI_ASSERT(clipping_threshold >= 0 && zeroing_threshold >= 0 && + zeroing_interval > 0 && recurrence_interval > 0 && dim > 0); + dim_ = dim; + clipping_threshold_ = clipping_threshold; + zeroing_threshold_ = zeroing_threshold; + zeroing_interval_ = zeroing_interval; + recurrence_interval_ = recurrence_interval; + num_clipped_ = 0.0; + num_zeroed_ = 0.0; + count_ = 0.0; + count_zeroing_boundaries_ = 0.0; +} + +// virtual +void BackpropTruncationComponent::InitFromConfig(ConfigLine *cfl) { + int32 dim = 0; + bool ok = cfl->GetValue("dim", &dim); + BaseFloat clipping_threshold = 15.0; + BaseFloat zeroing_threshold = 2.0; + int32 zeroing_interval = 20, recurrence_interval = 1; + cfl->GetValue("clipping-threshold", &clipping_threshold); + cfl->GetValue("zeroing-threshold", &zeroing_threshold); + cfl->GetValue("zeroing-interval", &zeroing_interval); + cfl->GetValue("recurrence-interval", &recurrence_interval); + if (!ok || cfl->HasUnusedValues() || + clipping_threshold < 0 || zeroing_threshold < 0 || zeroing_interval < 1 || + recurrence_interval < 1 || dim <= 0) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + Init(dim, clipping_threshold, zeroing_threshold, + zeroing_interval, recurrence_interval); +} + +// virtual +Component* BackpropTruncationComponent::Copy() const { + BackpropTruncationComponent *ans = new BackpropTruncationComponent(); + ans->dim_ = dim_; + ans->clipping_threshold_ = clipping_threshold_; + ans->zeroing_threshold_ = zeroing_threshold_; + ans->zeroing_interval_ = zeroing_interval_; + ans->recurrence_interval_ = recurrence_interval_; + ans->num_clipped_ = num_clipped_; + ans->num_zeroed_ = num_zeroed_; + ans->count_ = count_; + ans->count_zeroing_boundaries_ = count_zeroing_boundaries_; + return ans; +} + +// virtual +ComponentPrecomputedIndexes* +BackpropTruncationComponent::PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const { + int32 num_input_indexes = input_indexes.size(), + num_output_indexes = output_indexes.size(); + KALDI_ASSERT(num_input_indexes == num_output_indexes); + Vector zeroing_cpu(num_output_indexes); + + for (int32 i = 0; i < num_output_indexes; i++) { + const int32 output_n = output_indexes[i].n; + const int32 output_t = output_indexes[i].t; + // checks if output_t crosses a boundary that is a multiple of + // zeroing_interval_. Note that frame (output_t - recurrence_interval_) is + // right before frame output_t in RNNs. If the range + // [output_t - recurrence_interval_, output_t] contains a multiple of + // zeroing_interval_, then frame output_t crosses the boundary. + // output_n is used to shift where we put the boundary, so that + // we don't always zero out gradients on frame 0. It will help avoid + // learning utterance-boundary effects. 
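// A concrete illustration of the boundary test described above (values chosen for
// illustration; DivRoundDown is a local stand-in for Kaldi's DivideRoundingDown):

// Floor division, valid for negative numerators too.
static int DivRoundDown(int a, int b) {
  return (a >= 0) ? (a / b) : -((-a + b - 1) / b);
}
// True if the span (t - recurrence_interval, t], shifted by n, contains a
// multiple of zeroing_interval.
static bool CrossesZeroingBoundary(int t, int n, int zeroing_interval,
                                   int recurrence_interval) {
  return DivRoundDown(t - n, zeroing_interval) !=
         DivRoundDown(t - recurrence_interval - n, zeroing_interval);
}
// With zeroing_interval = 20, recurrence_interval = 3, n = 0:
//   t = 41: floor(41/20) = 2, floor(38/20) = 1  -> boundary crossed, zero the gradient here;
//   t = 45: floor(45/20) = 2, floor(42/20) = 2  -> no boundary, gradient kept.
// In an nnet3 config these parameters appear on the component line, e.g.
//   component name=c type=BackpropTruncationComponent dim=512 clipping-threshold=15 zeroing-threshold=2 zeroing-interval=20 recurrence-interval=3
// (the thresholds and zeroing-interval are the InitFromConfig() defaults shown above;
// dim and recurrence-interval here are illustrative).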
+ if (DivideRoundingDown(output_t - output_n, zeroing_interval_) != + DivideRoundingDown(output_t - recurrence_interval_ - output_n, + zeroing_interval_)) + zeroing_cpu(i) = -1.0; + } + + BackpropTruncationComponentPrecomputedIndexes *ans = new + BackpropTruncationComponentPrecomputedIndexes(); + ans->zeroing = zeroing_cpu; + ans->zeroing_sum = -zeroing_cpu.Sum(); + return ans; +} + +// virtual +void BackpropTruncationComponent::Propagate( + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in, + CuMatrixBase<BaseFloat> *out) const { + out->CopyFromMat(in); +} + +// virtual +void BackpropTruncationComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase<BaseFloat> &, //in_value + const CuMatrixBase<BaseFloat> &, + const CuMatrixBase<BaseFloat> &out_deriv, + Component *to_update_in, // may be NULL; may be + // identical to "this" or different. + CuMatrixBase<BaseFloat> *in_deriv) const { + const BackpropTruncationComponentPrecomputedIndexes *indexes = + dynamic_cast<const BackpropTruncationComponentPrecomputedIndexes*>( + indexes_in); + KALDI_ASSERT(indexes->zeroing.Dim() == out_deriv.NumRows()); + // the following statement will do nothing if in_deriv and out_deriv have same + // memory. + in_deriv->CopyFromMat(out_deriv); + + BackpropTruncationComponent *to_update = + dynamic_cast<BackpropTruncationComponent*>(to_update_in); + + // computes clipping_scales + BaseFloat clipping_threshold = + (clipping_threshold_ <= 0.0 ? 1.0e+10 : clipping_threshold_); + // each row in the derivative matrix, which corresponds to one sample in + // the mini-batch, is scaled to have a max-norm of clipping_threshold_ + CuVector<BaseFloat> clipping_scales(in_deriv->NumRows()); + clipping_scales.AddDiagMat2(pow(clipping_threshold, -2), *in_deriv, + kNoTrans, 0.0); + // now clipping_scales contains the squared (norm of each row divided by + // clipping_threshold) + int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); + // now clipping_scales contains max(1, squared-(norm/clipping_threshold)) + clipping_scales.ApplyPow(-0.5); + // now clipping_scales contains min(1, clipping_threshold/vector_norm) + if (to_update != NULL) { + to_update->num_clipped_ += (clipping_scales.Dim() - num_not_scaled); + to_update->count_ += clipping_scales.Dim(); + } + + // computes zeroing_scales + BaseFloat zeroing_threshold = + (zeroing_threshold_ <= 0.0 ? 1.0e+10 : zeroing_threshold_); + // zeroing_scales_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors).
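// The clipping step above is a per-row max-norm constraint.  On a plain CPU matrix the
// same operation could be written directly as follows (a sketch, not the component's
// actual CuMatrix code path):
#include <cmath>
#include "matrix/matrix-lib.h"

namespace kaldi {
// Scale down any row of *deriv whose 2-norm exceeds 'threshold' so that its norm
// becomes exactly 'threshold'; other rows are left untouched.  This is what the
// AddDiagMat2 / ApplyFloor / ApplyPow(-0.5) sequence computes as one batched
// operation on the GPU.
void ClipRowsToMaxNorm(BaseFloat threshold, Matrix<BaseFloat> *deriv) {
  for (MatrixIndexT r = 0; r < deriv->NumRows(); r++) {
    SubVector<BaseFloat> row(deriv->Row(r));
    BaseFloat norm = std::sqrt(VecVec(row, row));
    if (norm > threshold)
      row.Scale(threshold / norm);  // i.e. multiply by min(1, threshold/norm)
  }
}
}  // namespace kaldi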
+ CuMatrix<BaseFloat> zeroing_scales(1, in_deriv->NumRows()); + CuSubVector<BaseFloat> zeroing_scales_vec(zeroing_scales, 0); + zeroing_scales_vec.Set(-pow(zeroing_threshold, 2)); + // now zeroing_scales_vec contains -(squared zeroing_threshold) + zeroing_scales_vec.AddDiagMat2(1.0, *in_deriv, kNoTrans, 1.0); + // now zeroing_scales_vec contains squared norm of each row - + // squared zeroing_threshold + zeroing_scales.ApplyHeaviside(); + // now the element of zeroing_scales_vec is 1.0 if its corresponding + // sample's norm exceeds zeroing_threshold, and 0.0 otherwise + zeroing_scales_vec.MulElements(indexes->zeroing); + // now the element of zeroing_scales_vec is -1.0 if we want to zero its + // corresponding sample's gradient, and 0.0 otherwise + if (to_update != NULL) { + to_update->num_zeroed_ -= zeroing_scales_vec.Sum(); // since it is negative + to_update->count_zeroing_boundaries_ += indexes->zeroing_sum; + } + zeroing_scales_vec.Add(1.0); + // now the element of zeroing_scales_vec is 0.0 if we want to zero its + // corresponding sample's gradient, and 1.0 otherwise + + // combines clipping_scales and zeroing_scales and applies combined_scales + // to in_deriv all at once + CuVector<BaseFloat> combined_scales(clipping_scales); + combined_scales.MulElements(zeroing_scales_vec); + in_deriv->MulRowsVec(combined_scales); +} + +// virtual +void BackpropTruncationComponent::ZeroStats() { + count_ = 0.0; + count_zeroing_boundaries_ = 0.0; + num_clipped_ = 0.0; + num_zeroed_ = 0.0; +} + +// virtual +void BackpropTruncationComponent::Scale(BaseFloat scale) { + count_ *= scale; + count_zeroing_boundaries_ *= scale; + num_clipped_ *= scale; + num_zeroed_ *= scale; +} + +// virtual +void BackpropTruncationComponent::Add(BaseFloat alpha, + const Component &other_in) { + const BackpropTruncationComponent *other = + dynamic_cast<const BackpropTruncationComponent*>(&other_in); + KALDI_ASSERT(other != NULL); + count_ += alpha * other->count_; + count_zeroing_boundaries_ += alpha * other->count_zeroing_boundaries_; + num_clipped_ += alpha * other->num_clipped_; + num_zeroed_ += alpha * other->num_zeroed_; +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 5e94d4ba332..93a46eaedbf 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -440,6 +440,139 @@ class StatisticsPoolingComponentPrecomputedIndexes: virtual std::string Type() const { return "StatisticsPoolingComponentPrecomputedIndexes"; } }; +// BackpropTruncationComponent zeroes out the gradients every certain number +// of frames, as well as having gradient-clipping functionality as +// ClipGradientComponent.
+// This component will be used to prevent gradient explosion problem in +// recurrent neural networks +class BackpropTruncationComponent: public Component { + public: + BackpropTruncationComponent(int32 dim, + BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, + int32 zeroing_interval, + int32 recurrence_interval) { + Init(dim, clipping_threshold, zeroing_threshold, + zeroing_interval, recurrence_interval);} + + BackpropTruncationComponent(): dim_(0), clipping_threshold_(-1), + zeroing_threshold_(-1), zeroing_interval_(0), recurrence_interval_(0), + num_clipped_(0), num_zeroed_(0), count_(0), count_zeroing_boundaries_(0) { } + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + virtual void InitFromConfig(ConfigLine *cfl); + void Init(int32 dim, BaseFloat clipping_threshold, + BaseFloat zeroing_threshold, int32 zeroing_interval, + int32 recurrence_interval); + + virtual std::string Type() const { return "BackpropTruncationComponent"; } + + virtual int32 Properties() const { + return kLinearInInput|kPropagateInPlace|kBackpropInPlace; + } + + virtual void ZeroStats(); + + virtual Component* Copy() const; + + virtual void Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value, + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual ComponentPrecomputedIndexes* PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const; + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; + virtual ~BackpropTruncationComponent() { + } + private: + // input/output dimension + int32 dim_; + + // threshold (e.g., 30) to be used for clipping corresponds to max-row-norm + BaseFloat clipping_threshold_; + + // threshold (e.g., 3) to be used for zeroing corresponds to max-row-norm + BaseFloat zeroing_threshold_; + + // interval (e.g., 20, in number of frames) at which we would zero the + // gradient if the norm of the gradient is above zeroing_threshold_ + int32 zeroing_interval_; + + // recurrence_interval_ should be the absolute recurrence offset used in RNNs + // (e.g., 3). It is used to see whether the index the component is processing, + // crosses a boundary that's a multiple of zeroing_interval_ frames. + int32 recurrence_interval_; + + // component-node name, used in the destructor to print out stats of + // self-repair + std::string debug_info_; + + BackpropTruncationComponent &operator = + (const BackpropTruncationComponent &other); // Disallow. 
+ + protected: + // variables to store stats + // An element corresponds to rows of derivative matrix + double num_clipped_; // number of elements which were clipped + double num_zeroed_; // number of elements which were zeroed + double count_; // number of elements which were processed + double count_zeroing_boundaries_; // number of zeroing boundaries where we had + // the opportunity to perform zeroing + // the gradient + +}; + +class BackpropTruncationComponentPrecomputedIndexes: + public ComponentPrecomputedIndexes { + public: + + // zeroing has the same dimension as the number of rows of out-deriv. + // Each element in zeroing can take two possible values: -1.0, meaning its + // corresponding frame is one that we need to consider zeroing the + // gradient of, and 0.0 otherwise + CuVector zeroing; + + // caches the negative sum of elements in zeroing for less CUDA calls + // (the sum is computed by CPU). Note that this value would be positive. + BaseFloat zeroing_sum; + + BackpropTruncationComponentPrecomputedIndexes(): zeroing_sum(0.0) {} + + // this class has a virtual destructor so it can be deleted from a pointer + // to ComponentPrecomputedIndexes. + virtual ~BackpropTruncationComponentPrecomputedIndexes() { } + + virtual ComponentPrecomputedIndexes* Copy() const { + return new BackpropTruncationComponentPrecomputedIndexes(*this); + } + + virtual void Write(std::ostream &ostream, bool binary) const; + + virtual void Read(std::istream &istream, bool binary); + + virtual std::string Type() const { + return "BackpropTruncationComponentPrecomputedIndexes"; + } +}; + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc index af2147147d7..ad5f715a294 100644 --- a/src/nnet3/nnet-nnet.cc +++ b/src/nnet3/nnet-nnet.cc @@ -783,6 +783,13 @@ Nnet& Nnet::operator =(const Nnet &nnet) { std::string Nnet::Info() const { std::ostringstream os; + + if(IsSimpleNnet(*this)) { + int32 left_context, right_context; + ComputeSimpleNnetContext(*this, &left_context, &right_context); + os << "left-context: " << left_context << "\n"; + os << "right-context: " << right_context << "\n"; + } os << "num-parameters: " << NumParameters(*this) << "\n"; os << "modulus: " << this->Modulus() << "\n"; std::vector config_lines; diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 733d162748e..3bacf455f3b 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -427,7 +427,7 @@ bool IsValidName(const std::string &name) { for (size_t i = 0; i < name.size(); i++) { if (i == 0 && !isalpha(name[i]) && name[i] != '_') return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-') + if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') return false; } return true; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index b84ac90c76e..390ab2885a9 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -96,16 +96,16 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) { BaseFloat dropout_proportion = 0.0; bool ok = cfl->GetValue("dim", &dim) && cfl->GetValue("dropout-proportion", &dropout_proportion); - if (!ok || cfl->HasUnusedValues() || dim <= 0 || + if (!ok || cfl->HasUnusedValues() || dim <= 0 || dropout_proportion < 0.0 || dropout_proportion > 1.0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << 
cfl->WholeLine() << "\""; Init(dim, dropout_proportion); } std::string DropoutComponent::Info() const { std::ostringstream stream; - stream << Type() << ", dim = " << dim_ + stream << Type() << ", dim = " << dim_ << ", dropout-proportion = " << dropout_proportion_; return stream.str(); } @@ -119,12 +119,12 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes, BaseFloat dropout = dropout_proportion_; KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0); - // This const_cast is only safe assuming you don't attempt + // This const_cast is only safe assuming you don't attempt // to use multi-threaded code with the GPU. - const_cast&>(random_generator_).RandUniform(out); + const_cast&>(random_generator_).RandUniform(out); - out->Add(-dropout); // now, a proportion "dropout" will be <0.0 - out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will + out->Add(-dropout); // now, a proportion "dropout" will be <0.0 + out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will // be zero and (1 - dropout) will be 1.0. out->MulElements(in); @@ -147,7 +147,7 @@ void DropoutComponent::Backprop(const std::string &debug_info, } - + void DropoutComponent::Read(std::istream &is, bool binary) { ExpectOneOrTwoTokens(is, binary, "", ""); ReadBasicType(is, binary, &dim_); @@ -415,21 +415,7 @@ void NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { KALDI_ASSERT(out->NumCols() == in.NumCols() + (add_log_stddev_ ? 1 : 0)); - CuSubMatrix out_no_log(*out, 0, out->NumRows(), 0, input_dim_); - if (in.Data() != out_no_log.Data()) - out_no_log.CopyFromMat(in); - CuVector in_norm(in.NumRows()); - BaseFloat d_scaled = in.NumCols() * target_rms_ * target_rms_; - in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0); - in_norm.ApplyFloor(kSquaredNormFloor); - in_norm.ApplyPow(-0.5); - out_no_log.MulRowsVec(in_norm); - if (add_log_stddev_) { - in_norm.ApplyLog(); - in_norm.Scale(-1.0); - in_norm.Add(log(target_rms_)); - out->CopyColFromVec(in_norm, in.NumCols()); - } + cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); } /* diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index e02ae4974c9..0b000b5b4ef 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -513,6 +513,178 @@ void GenerateConfigSequenceLstm( configs->push_back(os.str()); } +void GenerateConfigSequenceLstmWithTruncation( + const NnetGenerationOptions &opts, + std::vector *configs) { + std::ostringstream os; + + std::vector splice_context; + for (int32 i = -5; i < 4; i++) + if (Rand() % 3 == 0) + splice_context.push_back(i); + if (splice_context.empty()) + splice_context.push_back(0); + + int32 input_dim = 10 + Rand() % 20, + spliced_dim = input_dim * splice_context.size(), + output_dim = (opts.output_dim > 0 ? 
+ opts.output_dim : + 100 + Rand() % 200), + cell_dim = 40 + Rand() % 50, + projection_dim = std::ceil(cell_dim / (Rand() % 10 + 1)); + int32 clipping_threshold = RandInt(6, 50), + zeroing_threshold = RandInt(1, 5), + zeroing_interval = RandInt(1, 5) * 10; + + os << "input-node name=input dim=" << input_dim << std::endl; + + // Parameter Definitions W*(* replaced by - to have valid names) + // Input gate control : Wi* matrices + os << "component name=Wi-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=Wic type=PerElementScaleComponent " + << " dim=" << cell_dim << std::endl; + + // Forget gate control : Wf* matrices + os << "component name=Wf-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=Wfc type=PerElementScaleComponent " + << " dim=" << cell_dim << std::endl; + + // Output gate control : Wo* matrices + os << "component name=Wo-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=Woc type=PerElementScaleComponent " + << " dim=" << cell_dim << std::endl; + + // Cell input matrices : Wc* matrices + os << "component name=Wc-xr type=NaturalGradientAffineComponent" + << " input-dim=" << spliced_dim + projection_dim + << " output-dim=" << cell_dim << std::endl; + + + + // projection matrices : Wrm and Wpm + os << "component name=W-m type=NaturalGradientAffineComponent " + << " input-dim=" << cell_dim + << " output-dim=" << 2 * projection_dim << std::endl; + + // Output : Wyr and Wyp + os << "component name=Wy- type=NaturalGradientAffineComponent " + << " input-dim=" << 2 * projection_dim + << " output-dim=" << cell_dim << std::endl; + + // Defining the diagonal matrices + // Defining the final affine transform + os << "component name=final_affine type=NaturalGradientAffineComponent " + << "input-dim=" << cell_dim << " output-dim=" << output_dim << std::endl; + os << "component name=logsoftmax type=LogSoftmaxComponent dim=" + << output_dim << std::endl; + + // Defining the non-linearities + // declare a no-op component so that we can use a sum descriptor's output + // multiple times, and to make the config more readable given the equations + os << "component name=i type=SigmoidComponent dim=" + << cell_dim << std::endl; + os << "component name=f type=SigmoidComponent dim=" + << cell_dim << std::endl; + os << "component name=o type=SigmoidComponent dim=" + << cell_dim << std::endl; + os << "component name=g type=TanhComponent dim=" + << cell_dim << std::endl; + os << "component name=h type=TanhComponent dim=" + << cell_dim << std::endl; + os << "component name=c1 type=ElementwiseProductComponent " + << " input-dim=" << 2 * cell_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=c2 type=ElementwiseProductComponent " + << " input-dim=" << 2 * cell_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=m type=ElementwiseProductComponent " + << " input-dim=" << 2 * cell_dim + << " output-dim=" << cell_dim << std::endl; + os << "component name=c type=BackpropTruncationComponent dim=" + << cell_dim + << " clipping-threshold=" << clipping_threshold + << " zeroing-threshold=" << zeroing_threshold + << " zeroing-interval=" << zeroing_interval + << " recurrence-interval=1" << std::endl; + os << "component name=r 
type=BackpropTruncationComponent dim=" + << projection_dim + << " clipping-threshold=" << clipping_threshold + << " zeroing-threshold=" << zeroing_threshold + << " zeroing-interval=" << zeroing_interval + << " recurrence-interval=1" << std::endl; + + // Defining the computations + std::ostringstream temp_string_stream; + for (size_t i = 0; i < splice_context.size(); i++) { + int32 offset = splice_context[i]; + temp_string_stream << "Offset(input, " << offset << ")"; + if (i + 1 < splice_context.size()) + temp_string_stream << ", "; + } + std::string spliced_input = temp_string_stream.str(); + + std::string c_tminus1 = "IfDefined(Offset(c_t, -1))"; + os << "component-node name=c_t component=c input=Sum(c1_t, c2_t)\n"; + + // i_t + os << "component-node name=i1 component=Wi-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=i2 component=Wic " + << " input=" << c_tminus1 << std::endl; + os << "component-node name=i_t component=i input=Sum(i1, i2)\n"; + + // f_t + os << "component-node name=f1 component=Wf-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=f2 component=Wfc " + << " input=" << c_tminus1 << std::endl; + os << "component-node name=f_t component=f input=Sum(f1, f2)\n"; + + // o_t + os << "component-node name=o1 component=Wo-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=o2 component=Woc input=Sum(c1_t, c2_t)\n"; + os << "component-node name=o_t component=o input=Sum(o1, o2)\n"; + + // h_t + os << "component-node name=h_t component=h input=Sum(c1_t, c2_t)\n"; + + // g_t + os << "component-node name=g1 component=Wc-xr input=Append(" + << spliced_input << ", IfDefined(Offset(r_t, -1)))\n"; + os << "component-node name=g_t component=g input=g1\n"; + + // parts of c_t + os << "component-node name=c1_t component=c1 " + << " input=Append(f_t, " << c_tminus1 << ")\n"; + os << "component-node name=c2_t component=c2 input=Append(i_t, g_t)\n"; + + // m_t + os << "component-node name=m_t component=m input=Append(o_t, h_t)\n"; + + // r_t and p_t + os << "component-node name=rp_t component=W-m input=m_t\n"; + // Splitting outputs of Wy- node + os << "dim-range-node name=r_t_pretrunc input-node=rp_t dim-offset=0 " + << "dim=" << projection_dim << std::endl; + os << "component-node name=r_t component=r input=r_t_pretrunc\n"; + + // y_t + os << "component-node name=y_t component=Wy- input=rp_t\n"; + + // Final affine transform + os << "component-node name=final_affine component=final_affine input=y_t\n"; + os << "component-node name=posteriors component=logsoftmax input=final_affine\n"; + os << "output-node name=output input=posteriors\n"; + configs->push_back(os.str()); +} + // This is a different LSTM config where computation is bunched according // to inputs this is not complete, it is left here for future comparisons void GenerateConfigSequenceLstmType2( @@ -802,7 +974,7 @@ void GenerateConfigSequence( const NnetGenerationOptions &opts, std::vector *configs) { start: - int32 network_type = RandInt(0, 10); + int32 network_type = RandInt(0, 11); switch(network_type) { case 0: GenerateConfigSequenceSimplest(opts, configs); @@ -855,6 +1027,12 @@ void GenerateConfigSequence( case 10: GenerateConfigSequenceStatistics(opts, configs); break; + case 11: + if (!opts.allow_recursion || !opts.allow_context || + !opts.allow_nonlinearity) + goto start; + GenerateConfigSequenceLstmWithTruncation(opts, configs); + break; default: KALDI_ERR << "Error 
generating config sequence."; } diff --git a/src/nnet3/online-nnet3-decodable-simple.h b/src/nnet3/online-nnet3-decodable-simple.h index cfd70ccea38..af7c18da64b 100644 --- a/src/nnet3/online-nnet3-decodable-simple.h +++ b/src/nnet3/online-nnet3-decodable-simple.h @@ -102,6 +102,7 @@ class DecodableNnet3SimpleOnline: public DecodableInterface { /// Indices are one-based! This is for compatibility with OpenFst. virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } + int32 FrameSubsamplingFactor() const { return opts_.frame_subsampling_factor; } private: /// If the neural-network outputs for this frame are not cached, it computes diff --git a/src/nnet3bin/nnet3-latgen-faster-parallel.cc b/src/nnet3bin/nnet3-latgen-faster-parallel.cc index 7d157f6e89c..e55a213f14f 100644 --- a/src/nnet3bin/nnet3-latgen-faster-parallel.cc +++ b/src/nnet3bin/nnet3-latgen-faster-parallel.cc @@ -228,8 +228,10 @@ int main(int argc, char *argv[]) { } } + // the following constructor takes ownership of the FST pointer so that + // it is deleted when 'decoder' is deleted. LatticeFasterDecoder *decoder = - new LatticeFasterDecoder(fst_reader.Value(), config); + new LatticeFasterDecoder(config, fst_reader.Value().Copy()); DecodableInterface *nnet_decodable = new DecodableAmNnetSimpleParallel( diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc index fd4881666ae..8dd366166c0 100644 --- a/src/online2/online-nnet3-decoding.cc +++ b/src/online2/online-nnet3-decoding.cc @@ -72,8 +72,9 @@ void SingleUtteranceNnet3Decoder::GetBestPath(bool end_of_utterance, bool SingleUtteranceNnet3Decoder::EndpointDetected( const OnlineEndpointConfig &config) { + int32 subsample = decodable_.FrameSubsamplingFactor(); return kaldi::EndpointDetected(config, tmodel_, - feature_pipeline_->FrameShiftInSeconds(), + feature_pipeline_->FrameShiftInSeconds() * subsample, decoder_); } diff --git a/src/sgmm2/Makefile b/src/sgmm2/Makefile index 41a4175aa3b..f0da85e48de 100644 --- a/src/sgmm2/Makefile +++ b/src/sgmm2/Makefile @@ -14,6 +14,6 @@ LIBNAME = kaldi-sgmm2 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/transform/Makefile b/src/transform/Makefile index 3ae8b1fa3a4..4df681f1ade 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -14,8 +14,8 @@ OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ LIBNAME = kaldi-transform -ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a +ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc index 81eee5bb4ee..4eab67f52be 100644 --- a/src/tree/context-dep.cc +++ b/src/tree/context-dep.cc @@ -178,9 +178,107 @@ void ContextDependency::Read (std::istream &is, bool binary) { to_pdf_ = to_pdf; } -void ContextDependency::GetPdfInfo(const std::vector &phones, - const std::vector &num_pdf_classes, // indexed by phone, - std::vector > > *pdf_info) const { +void ContextDependency::EnumeratePairs( + const std::vector &phones, + int32 self_loop_pdf_class, int32 forward_pdf_class, + const std::vector 
&phone_window, + unordered_set, PairHasher > *pairs) const { + std::vector new_phone_window(phone_window); + EventType vec; + + std::vector forward_pdfs, self_loop_pdfs; + + // get list of possible forward pdfs + vec.clear(); + for (size_t i = 0; i < N_; i++) + if (phone_window[i] >= 0) + vec.push_back(std::make_pair(static_cast(i), + static_cast(phone_window[i]))); + vec.push_back(std::make_pair(kPdfClass, static_cast(forward_pdf_class))); + std::sort(vec.begin(), vec.end()); + to_pdf_->MultiMap(vec, &forward_pdfs); + SortAndUniq(&forward_pdfs); + + // get list of possible self-loop pdfs + vec.clear(); + for (size_t i = 0; i < N_; i++) + if (phone_window[i] >= 0) + vec.push_back(std::make_pair(static_cast(i), + static_cast(phone_window[i]))); + vec.push_back(std::make_pair(kPdfClass, static_cast(self_loop_pdf_class))); + std::sort(vec.begin(), vec.end()); + to_pdf_->MultiMap(vec, &self_loop_pdfs); + SortAndUniq(&self_loop_pdfs); + + if (forward_pdfs.size() == 1 || self_loop_pdfs.size() == 1) { + for (size_t m = 0; m < forward_pdfs.size(); m++) + for (size_t n = 0; n < self_loop_pdfs.size(); n++) + pairs->insert(std::make_pair(forward_pdfs[m], self_loop_pdfs[n])); + } else { + // Choose 'position' as a phone position in 'context' that's currently + // -1, and that is as close as possible to the central position P. + int32 position = 0; + int32 min_dist = N_ - 1; + for (int32 i = 0; i < N_; i++) { + int32 dist = (P_ - i > 0) ? (P_ - i) : (i - P_); + if (phone_window[i] == -1 && dist < min_dist) { + position = i; + min_dist = dist; + } + } + KALDI_ASSERT(min_dist < N_); + KALDI_ASSERT(position != P_); + + // The next two lines have to do with how BOS/EOS effects are handled in + // phone context. Zero phone value in a non-central position (i.e. not + // position P_... and 'position' will never equal P_) means 'there is no + // phone here because we're at BOS or EOS'. 
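// A small worked trace of the recursion at this point (triphone case; the numbers are
// illustrative, not taken from a real tree):
// Suppose N_ = 3, P_ = 1, phones = {1, 2, 3}, and the current window is {-1, 2, -1},
// i.e. only the central phone is known.  If neither the forward nor the self-loop pdf
// set is already down to a single pdf, the wildcard position nearest the center, here
// index 0, is selected, and the function recurses with the windows {0, 2, -1}
// (position 0 at BOS/EOS), {1, 2, -1}, {2, 2, -1} and {3, 2, -1}.  Each recursive call
// either finds that one of the two pdf sets has a single element and inserts the
// cross-product of the two sets into 'pairs', or refines the remaining wildcard at
// index 2 in the same way.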
+ new_phone_window[position] = 0; + EnumeratePairs(phones, self_loop_pdf_class, forward_pdf_class, + new_phone_window, pairs); + + for (size_t i = 0 ; i < phones.size(); i++) { + new_phone_window[position] = phones[i]; + EnumeratePairs(phones, self_loop_pdf_class, forward_pdf_class, + new_phone_window, pairs); + } + } +} + +void ContextDependency::GetPdfInfo( + const std::vector &phones, + const std::vector > > &pdf_class_pairs, + std::vector > > > *pdf_info) const { + + KALDI_ASSERT(pdf_info != NULL); + pdf_info->resize(1 + *std::max_element(phones.begin(), phones.end())); + std::vector phone_window(N_, -1); + EventType vec; + for (size_t i = 0 ; i < phones.size(); i++) { + // loop over phones + int32 phone = phones[i]; + (*pdf_info)[phone].resize(pdf_class_pairs[phone].size()); + for (size_t j = 0; j < pdf_class_pairs[phone].size(); j++) { + // loop over pdf_class pairs + int32 pdf_class = pdf_class_pairs[phone][j].first, + self_loop_pdf_class = pdf_class_pairs[phone][j].second; + phone_window[P_] = phone; + + unordered_set, PairHasher > pairs; + EnumeratePairs(phones, self_loop_pdf_class, pdf_class, phone_window, &pairs); + unordered_set, PairHasher >::iterator iter = pairs.begin(), + end = pairs.end(); + for (; iter != end; ++iter) + (*pdf_info)[phone][j].push_back(*iter); + std::sort( ((*pdf_info)[phone][j]).begin(), ((*pdf_info)[phone][j]).end()); + } + } +} + +void ContextDependency::GetPdfInfo( + const std::vector &phones, + const std::vector &num_pdf_classes, // indexed by phone, + std::vector > > *pdf_info) const { EventType vec; KALDI_ASSERT(pdf_info != NULL); diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h index 08dc974570d..6342d89667b 100644 --- a/src/tree/context-dep.h +++ b/src/tree/context-dep.h @@ -20,6 +20,7 @@ #ifndef KALDI_TREE_CONTEXT_DEP_H_ #define KALDI_TREE_CONTEXT_DEP_H_ +#include "util/stl-utils.h" #include "itf/context-dep-itf.h" #include "tree/event-map.h" #include "matrix/matrix-lib.h" @@ -99,9 +100,36 @@ class ContextDependency: public ContextDependencyInterface { /// GetPdfInfo returns a vector indexed by pdf-id, saying for each pdf which /// pairs of (phone, pdf-class) it can correspond to. (Usually just one). /// c.f. hmm/hmm-topology.h for meaning of pdf-class. - virtual void GetPdfInfo(const std::vector &phones, // list of phones - const std::vector &num_pdf_classes, // indexed by phone, - std::vector > > *pdf_info) + /// This is the old, simpler interface of GetPdfInfo(), and that this one can + /// only be called if the HmmTopology object's IsHmm() function call returns + /// true. + virtual void GetPdfInfo( + const std::vector &phones, // list of phones + const std::vector &num_pdf_classes, // indexed by phone, + std::vector > > *pdf_info) + const; + + /// This function outputs information about what possible pdf-ids can + /// be generated for HMM-states; it covers the general case where + /// the self-loop pdf-class may be different from the forward-transition + /// pdf-class, so we are asking not about the set of possible pdf-ids + /// for a given (phone, pdf-class), but the set of possible ordered pairs + /// (forward-transition-pdf, self-loop-pdf) for a given (phone, + /// forward-transition-pdf-class, self-loop-pdf-class). + /// Note: 'phones' is a list of integer ids of phones, and + /// 'pdf-class-pairs', indexed by phone, is a list of pairs + /// (forward-transition-pdf-class, self-loop-pdf-class) that we can have for + /// that phone. 
+ /// The output 'pdf_info' is indexed first by phone and then by the + /// same index that indexes each element of 'pdf_class_pairs', + /// and tells us for each pair in 'pdf_class_pairs', what is the + /// list of possible (forward-transition-pdf-id, self-loop-pdf-id) that + /// we can have. + /// This is less efficient than the other version of GetPdfInfo(). + virtual void GetPdfInfo( + const std::vector &phones, + const std::vector > > &pdf_class_pairs, + std::vector > > > *pdf_info) const; private: @@ -109,6 +137,20 @@ class ContextDependency: public ContextDependencyInterface { int32 P_; EventMap *to_pdf_; // owned here. + // 'context' is the context-window of phones, of + // length N, with -1 for those positions where phones + // that are currently unknown, treated as wildcards; at least + // the central phone [position P] must be a real phone, i.e. + // not -1. + // This function inserts any allowed pairs (forward_pdf, self_loop_pdf) + // to the set "pairs". + void EnumeratePairs( + const std::vector &phones, + int32 self_loop_pdf_class, int32 forward_pdf_class, + const std::vector &context, + unordered_set, PairHasher > *pairs) + const; + KALDI_DISALLOW_COPY_AND_ASSIGN(ContextDependency); }; diff --git a/src/util/kaldi-holder.cc b/src/util/kaldi-holder.cc index ee7dd66e922..a26bdf2ce29 100644 --- a/src/util/kaldi-holder.cc +++ b/src/util/kaldi-holder.cc @@ -72,8 +72,8 @@ bool ExtractObjectRange(const Matrix &input, const std::string &range, // template instantiation template bool ExtractObjectRange(const Matrix &, const std::string &, Matrix *); -template bool ExtractObjectRange(const Matrix &, const std::string &, - Matrix *); +template bool ExtractObjectRange(const Matrix &, const std::string &, + Matrix *); bool ExtractRangeSpecifier(const std::string &rxfilename_with_range, std::string *data_rxfilename, diff --git a/tools/Makefile b/tools/Makefile index 548afafca1e..714e613e4bf 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -78,8 +78,12 @@ openfst-$(OPENFST_VERSION)/lib: | openfst-$(OPENFST_VERSION)/Makefile # Add the -O flag to CXXFLAGS on cygwin as it can fix the compilation error # "file too big". openfst-$(OPENFST_VERSION)/Makefile: openfst-$(OPENFST_VERSION)/.patched | check_required_programs +# Note: OSTYPE path is probably dead for latest cygwin64 (installed on 2016/11/11). 
ifeq ($(OSTYPE),cygwin) - cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" + cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" +# This new OS path is confirmed working on Windows 10 / Cygwin64 +else ifeq ($(OS),Windows_NT) + cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS) -O -Wa,-mbig-obj" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" else cd openfst-$(OPENFST_VERSION)/; ./configure --prefix=`pwd` --enable-static --enable-shared --enable-far --enable-ngram-fsts CXX=$(CXX) CXXFLAGS="$(CXXFLAGS)" LDFLAGS="$(LDFLAGS)" LIBS="-ldl" endif @@ -96,7 +100,7 @@ openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz tar xozf openfst-$(OPENFST_VERSION).tar.gz openfst-$(OPENFST_VERSION).tar.gz: - wget http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ + wget --tries=1 -T 5 http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ wget -T 10 -t 3 http://www.openslr.org/resources/2/openfst-$(OPENFST_VERSION).tar.gz sclite: sclite_compiled @@ -172,5 +176,3 @@ openblas_compiled: cd OpenBLAS; sed 's:# FCOMMON_OPT = -frecursive:FCOMMON_OPT = -frecursive:' < Makefile.rule >tmp && mv tmp Makefile.rule # $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=1 NUM_THREADS=64 -C OpenBLAS all install $(MAKE) PREFIX=`pwd`/OpenBLAS/install FC=gfortran $(fortran_opt) DEBUG=1 USE_THREAD=0 -C OpenBLAS all install - - diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index b03020f292d..02145c7f0c8 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -36,6 +36,11 @@ else fi fi +command -v swig >/dev/null 2>&1 || { + echo >&2 "$0: Error: I require swig but it's not installed."; + echo >&2 " Please install swig and run this script again. " + exit 1; +} if [ -d ./g2p ] || [ -d sequitur ] ; then echo >&2 "$0: Warning: old installation of Sequitur found. You should manually" @@ -59,7 +64,7 @@ if [ ! -d ./sequitur-g2p ] ; then } fi #just to retain backward compatibility for a while. Can be removed -#in a couple of months. +#in a couple of months. ln -sf sequitur-g2p sequitur diff --git a/tools/extras/travis_script.sh b/tools/extras/travis_script.sh index 564760a7353..a857f538edd 100755 --- a/tools/extras/travis_script.sh +++ b/tools/extras/travis_script.sh @@ -54,7 +54,7 @@ runvx cd tools runvx make openfst $CCC CXXFLAGS="$CF" -j$MAXPAR cd .. runvx cd src -runvx ./configure --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr +runvx ./configure --shared --use-cuda=no --mathlib=OPENBLAS --openblas-root=$XROOT/usr make_kaldi() { runvx make "$@" $CCC EXTRA_CXXFLAGS="$CF" EXTRA_LDLIBS="$LDF"