diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh index d9437af7e0c..5943494d8e1 100755 --- a/egs/ami/s5/local/chain/run_blstm_ami_5.sh +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -118,7 +118,6 @@ if [ $stage -le 17 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -129,6 +128,7 @@ if [ $stage -le 17 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index f74a4ebaf6a..52b24e3a27c 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -48,6 +48,13 @@ %WER 24.2 | 13098 94477 | 79.3 12.2 8.6 3.5 24.2 57.1 | -0.178 | exp/ihm/nnet3/tdnn_sp/decode_dev/ascore_11/dev_hires.ctm.filt.sys %WER 25.4 | 12643 89970 | 77.6 13.7 8.7 3.0 25.4 56.3 | -0.067 | exp/ihm/nnet3/tdnn_sp/decode_eval/ascore_12/eval_hires.ctm.filt.sys +# local/nnet3/run_blstm.sh --mic ihm +# nnet3 xent BLSTM with data cleaning +# for d in exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 22.3 | 13098 94494 | 80.9 11.7 7.4 3.2 22.3 55.7 | -0.618 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.5 | 12643 89962 | 80.2 12.7 7.1 2.7 22.5 53.4 | -0.476 | exp/ihm/nnet3_cleaned/lstm_bidirectional_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys + ############################################ # local/chain/run_tdnn.sh --mic ihm --stage 12 & diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index a6c9d8192ec..7b1e56b5903 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -46,6 +46,12 @@ %WER 41.6 | 14493 94516 | 63.3 23.5 13.2 4.9 41.6 66.8 | 0.639 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_dev/ascore_13/dev_hires_o4.ctm.filt.sys %WER 46.0 | 13597 89967 | 57.5 24.9 17.6 3.6 46.0 68.1 | 0.601 | exp/sdm1/nnet3/tdnn_sp_ihmali/decode_eval/ascore_14/eval_hires_o4.ctm.filt.sys +# xent BLSTM system; cleaned data and IHM alignments. 
+# local/nnet3/run_blstm.sh --mic sdm1 --use-ihm-ali true +# for d in exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done +# Note: the results are with ClipGradientComponent, which may be different from with BackpropTruncationComponent +%WER 37.8 | 14633 94518 | 67.1 22.3 10.7 4.9 37.8 64.2 | 0.745 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 41.4 | 13809 89628 | 62.7 24.1 13.2 4.1 41.4 65.2 | 0.723 | exp/sdm1/nnet3_cleaned/lstm_bidirectional_sp_ihmali/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys # ========================= diff --git a/egs/ami/s5b/local/nnet3/run_blstm.sh b/egs/ami/s5b/local/nnet3/run_blstm.sh new file mode 100755 index 00000000000..776151fb5aa --- /dev/null +++ b/egs/ami/s5b/local/nnet3/run_blstm.sh @@ -0,0 +1,52 @@ +stage=0 +train_stage=-10 +mic=ihm +affix=bidirectional +common_egs_dir= +remove_egs=true +use_ihm_ali=false +train_set=train_cleaned +ihm_gmm=tri3 +nnet3_affix=_cleaned + +# BLSTM params +cell_dim=512 +rp_dim=128 +nrp_dim=128 +chunk_left_context=40 +chunk_right_context=40 + +# training options +srand=0 +num_jobs_initial=2 +num_jobs_final=12 +samples_per_iter=20000 +num_epochs=10 +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +local/nnet3/run_lstm.sh --affix $affix \ + --stage $stage \ + --srand $srand \ + --train-stage $train_stage \ + --train-set $train_set \ + --ihm-gmm $ihm_gmm \ + --nnet3-affix $nnet3_affix \ + --lstm-delay " [-1,1] [-2,2] [-3,3] " \ + --label-delay 0 \ + --cell-dim $cell_dim \ + --recurrent-projection-dim $rp_dim \ + --non-recurrent-projection-dim $nrp_dim \ + --common-egs-dir "$common_egs_dir" \ + --chunk-left-context $chunk_left_context \ + --chunk-right-context $chunk_right_context \ + --mic $mic \ + --num-jobs-initial $num_jobs_initial \ + --num-jobs-final $num_jobs_final \ + --samples-per-iter $samples_per_iter \ + --num-epochs $num_epochs \ + --use-ihm-ali $use_ihm_ali \ + --remove-egs $remove_egs + diff --git a/egs/ami/s5b/local/nnet3/run_lstm.sh b/egs/ami/s5b/local/nnet3/run_lstm.sh new file mode 100755 index 00000000000..c5583e2d0ef --- /dev/null +++ b/egs/ami/s5b/local/nnet3/run_lstm.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# This is the standard "lstm" system, built in nnet3. +# Please see RESULTS_* for examples of command lines invoking this script. + + +# local/nnet3/run_lstm.sh --mic sdm1 --use-ihm-ali true + +# local/nnet3/run_lstm.sh --mic ihm --stage 11 +# local/nnet3/run_lstm.sh --mic ihm --train-set train --gmm tri3 --nnet3-affix "" & +# +# local/nnet3/run_lstm.sh --mic sdm1 --stage 11 --affix cleaned2 --gmm tri4a_cleaned2 --train-set train_cleaned2 & + +# local/nnet3/run_lstm.sh --use-ihm-ali true --mic sdm1 --train-set train --gmm tri3 --nnet3-affix "" & + +# local/nnet3/run_lstm.sh --use-ihm-ali true --mic mdm8 & + +# local/nnet3/run_lstm.sh --use-ihm-ali true --mic mdm8 --train-set train --gmm tri3 --nnet3-affix "" & + +# this is an example of how you'd train a non-IHM system with the IHM +# alignments. the --gmm option in this case refers to the IHM gmm that's used +# to get the alignments. +# local/nnet3/run_lstm.sh --mic sdm1 --use-ihm-ali true --affix cleaned2 --gmm tri4a --train-set train_cleaned2 & + + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
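# (Cross-reference, for illustration: local/nnet3/run_blstm.sh, added in this
#  same patch, wraps this script and overrides the recurrence options to build
#  the bidirectional system, e.g. --lstm-delay " [-1,1] [-2,2] [-3,3] " and
#  --label-delay 0, whereas the unidirectional defaults below are
#  lstm_delay=" -1 -2 -3 " and label_delay=5.)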
+stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +ihm_gmm=tri3 # Only relevant if $use_ihm_ali is true, the name of the gmm-dir in + # the ihm directory that is to be used for getting alignments. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +common_egs_dir= +reporting_email= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 +max_param_change=2.0 + +# training options +train_stage=-10 +srand=0 +num_epochs=10 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=2 +num_jobs_final=12 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= +decode_iter= + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null || true + if [ -z $extra_left_context ]; then + extra_left_context=$chunk_left_context + fi + if [ -z $extra_right_context ]; then + extra_right_context=$chunk_right_context + fi + if [ -z $frames_per_chunk ]; then + frames_per_chunk=$chunk_width + fi + model_opts= + [ ! -z $decode_iter ] && model_opts=" --iter $decode_iter "; + for decode_set in dev eval; do + ( + num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_${decode_set} + steps/nnet3/decode.sh --nj 250 --cmd "$decode_cmd" \ + $model_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/$mic/${decode_set}_hires $decode_dir || exit 1; + ) & + done + wait; + if [ -f $dir/.error ]; then + echo "$0: error detected during decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 79d633b1ebd..522498d847d 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -176,7 +176,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -193,6 +192,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 5fa4ea565cd..c11420e5cfd 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -173,7 +173,6 @@ if [ $stage -le 12 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ 
--chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 1.414 \ --egs.stage $get_egs_stage \ @@ -188,6 +187,7 @@ if [ $stage -le 12 ]; then --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 0.99 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --cleanup.remove-egs $remove_egs \ --feat-dir data/train_rvb_min${min_seg_len}_hires \ --tree-dir $treedir \ diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh index b70da4e852a..a48e7ed55af 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -117,7 +117,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -128,6 +127,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/multi_en/s5/local/chain/run_blstm_6h.sh b/egs/multi_en/s5/local/chain/run_blstm_6h.sh index 51ca7db0495..5a68947282a 100644 --- a/egs/multi_en/s5/local/chain/run_blstm_6h.sh +++ b/egs/multi_en/s5/local/chain/run_blstm_6h.sh @@ -124,7 +124,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 1.414 \ @@ -135,6 +134,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh deleted file mode 100755 index ded03563711..00000000000 --- a/egs/swbd/s5c/local/chain/compare_wer.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - - -echo -n "System " -for x in $*; do printf "% 10s" $x; done -echo - -echo -n "WER on train_dev(tg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on train_dev(fg) " -for x in $*; do - wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on eval2000(tg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "WER on eval2000(fg) " -for x in $*; do - wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo - -echo -n "Final train prob " -for x in $*; do - prob=$(grep Overall 
exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final train prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo - -echo -n "Final valid prob (xent) " -for x in $*; do - prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') - printf "% 10s" $prob -done -echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..c8aae0b3b94 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh new file mode 100755 index 00000000000..542dae82581 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer_tdnn.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +models="" +for x in $*; do models="$models tdnn_${x}"; done + +local/chain/compare_wer_general.sh $models diff --git a/egs/swbd/s5c/local/chain/run_blstm.sh b/egs/swbd/s5c/local/chain/run_blstm.sh new file mode 120000 index 00000000000..0160247619f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm.sh @@ -0,0 +1 @@ +tuning/run_blstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_lstm.sh b/egs/swbd/s5c/local/chain/run_lstm.sh index 28e5e6cc20c..8b421ac2649 120000 --- a/egs/swbd/s5c/local/chain/run_lstm.sh +++ b/egs/swbd/s5c/local/chain/run_lstm.sh @@ -1 +1 @@ -tuning/run_lstm_6i.sh \ No newline at end of file 
+tuning/run_lstm_6j.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 669740d5f27..7b86453e14b 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7f.sh \ No newline at end of file +tuning/run_tdnn_7h.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh index 95f7aef2708..9ab72b40ac2 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6h.sh @@ -144,7 +144,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -155,6 +154,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh index 26cdaed29d7..6e1712c5187 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6i.sh @@ -150,7 +150,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -161,6 +160,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh new file mode 100755 index 00000000000..496bf502491 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6j.sh @@ -0,0 +1,228 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev-0 +# and bias-stddev=0 initialization. +# This run also accounts for changes in training due to the BackpropTruncationComponent + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh index fbced146199..3155e21b618 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6h.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh index c5548cbfa5c..f1a42cc175c 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6i.sh @@ -149,7 +149,6 @@ if [ $stage -le 13 ]; then --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.frames-per-iter 1200000 \ --trainer.max-param-change 2.0 \ @@ -160,6 +159,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts "--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh new file mode 100755 index 00000000000..4c765d35d30 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6j.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# 6j is same as 6i but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. +# Another minor change is that the final-affine component has param-stddev-0 +# and bias-stddev=0 initialization. + + + +# This run is affected by the bug that per-element-scale components do not have +# max-change. The updated results without the bug will be submitted soon. 
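# (The table below is in the column format printed by the new
#  local/chain/compare_wer_general.sh; presumably it was produced with
#  something like "local/chain/compare_wer_general.sh lstm_6i_ld5 lstm_6j_ld5",
#  though the exact invocation is not recorded here.)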
+#System lstm_6i_ld5 lstm_6j_ld5 +#WER on train_dev(tg) 14.65 14.43 +#WER on train_dev(fg) 13.38 13.17 +#WER on eval2000(tg) 16.9 16.9 +#WER on eval2000(fg) 15.4 15.3 +#Final train prob -0.0751668-0.0795697 +#Final valid prob -0.0928206-0.0926466 +#Final train prob (xent) -1.34549 -1.16067 +#Final valid prob (xent) -1.41301 -1.23679 +# + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
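  # (Worked example, for clarity: with xent_regularize=0.025 as set above,
  #  learning_rate_factor = 0.5 / 0.025 = 20 for this output layer.)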
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh index 28c20c92ab0..a678fe22044 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_d.sh @@ -155,7 +155,6 @@ if [ $stage -le 13 ]; then --chain.xent-regularize $xent_regularize \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --chain.left-deriv-truncate 0 \ --trainer.num-chunk-per-minibatch 64 \ --trainer.max-param-change 2.0 \ --trainer.num-epochs 4 \ @@ -165,6 +164,7 @@ if [ $stage -le 13 ]; then --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 10 \ --egs.stage $get_egs_stage \ --egs.opts="--frames-overlap-per-eg 0" \ --egs.chunk-width $chunk_width \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh new file mode 100755 index 00000000000..7a4512097d3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7g.sh @@ -0,0 +1,228 @@ +#!/bin/bash + + +# 7g is same as 7f but using the xconfig format of network specification. +# Also, the model is trained without layer-wise discriminative pretraining. + + +# System 7f 7g +# WER on train_dev(tg) 14.46 13.85 +# WER on train_dev(fg) 13.23 12.67 +# WER on eval2000(tg) 17.0 16.5 +# WER on eval2000(fg) 15.4 14.8 +# Final train prob -0.0882071 -0.0885075 +# Final valid prob -0.107545 -0.113462 +# Final train prob (xent) -1.26246 -1.25788 +# Final valid prob (xent) -1.35525 -1.37058 + + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7g # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +#common_egs_dir=exp/chain/tdnn_7e_sp/egs +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + # it doesn't make sense to have -6,0,6 splicing for a chain model + # as we compute a sequence of outputs and computation can be shared + # this has to be split into two -3,0,3 layers. But I will keep this + # to have same setup as 7f + relu-renorm-layer name=tdnn6 input=Append(-6,0,6) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh new file mode 100755 index 00000000000..00743ca9ebf --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -0,0 +1,218 @@ +#!/bin/bash + +#System tdnn_7g tdnn_7h +#WER on train_dev(tg) 13.98 13.84 +#WER on train_dev(fg) 12.78 12.84 +#WER on eval2000(tg) 16.7 16.5 +#WER on eval2000(fg) 14.9 14.8 +#Final train prob -0.0817467-0.0889771 +#Final valid prob -0.110475 -0.113102 +#Final train prob (xent) -1.20065 -1.2533 +#Final valid prob (xent) -1.3313 -1.36743 +# +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
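# Example (hypothetical invocation): any of the defaults above can be
# overridden on the command line via utils/parse_options.sh, e.g.
#   local/chain/tuning/run_tdnn_7h.sh --stage 13 --train-stage -10 --num-epochs 4
# (dashes in option names map to the underscored variables above).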
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info exp/chain/tri5_7d_tree_sp/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=625 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 1908b390151..be984ac24ee 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -8,8 +8,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# -. cmd.sh +# stage=0 @@ -26,7 +25,7 @@ extra_right_context=40 extra_left_context_initial=-1 extra_right_context_final=-1 -. cmd.sh +. ./cmd.sh . ./path.sh . 
./utils/parse_options.sh @@ -52,9 +51,9 @@ effective_learning_rate=0.0000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false +adjust_priors=true # May need to be set to false # because it does not help in some setups modify_learning_rates=true last_layer_factor=0.1 @@ -64,8 +63,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/wsj/s5/steps/libs/__init__.py b/egs/wsj/s5/steps/libs/__init__.py new file mode 100644 index 00000000000..2a472386568 --- /dev/null +++ b/egs/wsj/s5/steps/libs/__init__.py @@ -0,0 +1,9 @@ + + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +""" This package contains modules and subpackages used in kaldi scripts. +""" + +__all__ = ["common"] diff --git a/egs/wsj/s5/steps/libs/nnet3/__init__.py b/egs/wsj/s5/steps/libs/nnet3/__init__.py new file mode 100644 index 00000000000..03131a3a8d6 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/__init__.py @@ -0,0 +1,12 @@ + +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vimal Manohar +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + + +# This module has the python functions which facilitate the use of nnet3 toolkit +# It has two sub-modules +# xconfig : Library for parsing high level description of neural networks +# train : Library for training scripts diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py new file mode 100644 index 00000000000..6c824b1195b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + +"""This library has classes and methods to form neural network computation graphs, +in the nnet3 framework, using higher level abstractions called 'layers' +(e.g. sub-graphs like LSTMS ). + +Note : We use the term 'layer' though the computation graph can have a highly +non-linear structure as, other terms such as nodes/components have already been +used in C++ codebase of nnet3. + +This is basically a config parser module, where the configs have very concise +descriptions of a neural network. + +This module has methods to convert the xconfigs into a configs interpretable by +nnet3 C++ library. + +It generates three different configs: + 'init.config' : which is the config with the info necessary for computing + the preconditioning matrix i.e., LDA transform + e.g. 
+ input-node name=input dim=40 + input-node name=ivector dim=100 + output-node name=output input=Append(Offset(input, -2), Offset(input, -1), input, Offset(input, 1), Offset(input, 2), ReplaceIndex(ivector, t, 0)) objective=linear + + 'ref.config' : which is a version of the config file used to generate + a model for getting left and right context (it doesn't read + anything for the LDA-like transform and/or + presoftmax-prior-scale components) + + 'final.config' : which has the actual config used to initialize the model used + in training i.e, it has file paths for LDA transform and + other initialization files +""" + + +__all__ = ["utils", "layers", "parser"] diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py new file mode 100644 index 00000000000..35f19e5a626 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -0,0 +1,902 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the parent class from which all layers are inherited +and some basic layer definitions. +""" + +from __future__ import print_function +import sys +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +class XconfigLayerBase(object): + """ A base-class for classes representing layers of xconfig files. + """ + + def __init__(self, first_token, key_to_value, all_layers): + """ + first_token: first token on the xconfig line, e.g. 'affine-layer'.f + key_to_value: dictionary with parameter values + { 'name':'affine1', + 'input':'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))', + 'dim=1024' }. + The only required and 'special' values that are dealt with directly + at this level, are 'name' and 'input'. The rest are put in + self.config and are dealt with by the child classes' init functions. + all_layers: An array of objects inheriting XconfigLayerBase for all + previously parsed layers. + """ + + self.layer_type = first_token + if not 'name' in key_to_value: + raise xparser_error("Expected 'name' to be specified.", self.str()) + self.name = key_to_value['name'] + if not xutils.is_valid_line_name(self.name): + raise xparser_error("Invalid value: name={0}".format( + key_to_value['name']), self.str()) + + # the following, which should be overridden in the child class, sets + # default config parameters in self.config. + self.set_default_configs() + # The following is not to be reimplemented in child classes; + # it sets the config values to those specified by the user, and + # parses any Descriptors. + self.set_configs(key_to_value, all_layers) + # This method, sets the derived default config values + # i.e., config values when not specified can be derived from + # other values. It can be overridden in the child class. + self.set_derived_configs() + # the following, which should be overridden in the child class, checks + # that the config parameters that have been set are reasonable. + self.check_configs() + + + def set_configs(self, key_to_value, all_layers): + """ Sets the config variables. + We broke this code out of __init__ for clarity. + the child-class constructor will deal with the configuration values + in a more specific way. 
+ """ + + for key,value in key_to_value.items(): + if key != 'name': + if not key in self.config: + raise xparser_error("Configuration value {0}={1} was not" + " expected in layer of type {2}" + "".format(key, value, self.layer_type), + self.str()) + self.config[key] = xutils.convert_value_to_type(key, + type(self.config[key]), + value) + self.descriptors = dict() + self.descriptor_dims = dict() + # Parse Descriptors and get their dims and their 'final' string form. + # in self.descriptors[key] + for key in self.get_input_descriptor_names(): + if not key in self.config: + raise xparser_error("{0}: object of type {1} needs to override" + " get_input_descriptor_names()." + "".format(sys.argv[0], str(type(self))), + self.str()) + descriptor_string = self.config[key] # input string. + assert isinstance(descriptor_string, str) + desc = self.convert_to_descriptor(descriptor_string, all_layers) + desc_dim = self.get_dim_for_descriptor(desc, all_layers) + desc_norm_str = desc.str() + + # desc_output_str contains the "final" component names, those that + # appear in the actual config file (i.e. not names like + # 'layer.auxiliary_output'); that's how it differs from desc_norm_str. + # Note: it's possible that the two strings might be the same in + # many, even most, cases-- it depends whether + # output_name(self, auxiliary_output) + # returns self.get_name() + '.' + auxiliary_output + # when auxiliary_output is not None. + # That's up to the designer of the layer type. + desc_output_str = self.get_string_for_descriptor(desc, all_layers) + self.descriptors[key] = {'string':desc, + 'normalized-string':desc_norm_str, + 'final-string':desc_output_str, + 'dim':desc_dim} + + # the following helps to check the code by parsing it again. + desc2 = self.convert_to_descriptor(desc_norm_str, all_layers) + desc_norm_str2 = desc2.str() + # if the following ever fails we'll have to do some debugging. + if desc_norm_str != desc_norm_str2: + raise xparser_error("Likely code error: '{0}' != '{1}'" + "".format(desc_norm_str, desc_norm_str2), + self.str()) + + def str(self): + """Converts 'this' to a string which could be printed to + an xconfig file; in xconfig_to_configs.py we actually expand all the + lines to strings and write it as xconfig.expanded as a reference + (so users can see any defaults). + """ + + ans = '{0} name={1}'.format(self.layer_type, self.name) + ans += ' ' + ' '.join([ '{0}={1}'.format(key, self.config[key]) + for key in sorted(self.config.keys())]) + return ans + + def __str__(self): + + return self.str() + + + def normalize_descriptors(self): + """Converts any config variables in self.config which correspond to + Descriptors, into a 'normalized form' derived from parsing them as + Descriptors, replacing things like [-1] with the actual layer names, + and regenerating them as strings. We stored this when the object was + initialized, in self.descriptors; this function just copies them back + to the config. + """ + + for key, desc_str_dict in self.descriptors.items(): + self.config[key] = desc_str_dict['normalized-string'] + + def convert_to_descriptor(self, descriptor_string, all_layers): + """Convenience function intended to be called from child classes, + converts a string representing a descriptor ('descriptor_string') + into an object of type Descriptor, and returns it. It needs 'self' and + 'all_layers' (where 'all_layers' is a list of objects of type + XconfigLayerBase) so that it can work out a list of the names of other + layers, and get dimensions from them. 
+ """ + + prev_names = xutils.get_prev_names(all_layers, self) + tokens = xutils.tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (descriptor, pos) = xutils.parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise xparser_error("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1]), self.str()) + return descriptor + + def get_dim_for_descriptor(self, descriptor, all_layers): + """Returns the dimension of a Descriptor object. This is a convenience + function used in set_configs. + """ + + layer_to_dim_func = \ + lambda name: xutils.get_dim_from_layer_name(all_layers, self, + name) + return descriptor.dim(layer_to_dim_func) + + def get_string_for_descriptor(self, descriptor, all_layers): + """Returns the 'final' string form of a Descriptor object, + as could be used in config files. This is a convenience function + provided for use in child classes; + """ + + layer_to_string_func = \ + lambda name: xutils.get_string_from_layer_name(all_layers, + self, name) + return descriptor.config_string(layer_to_string_func) + + def get_name(self): + """Returns the name of this layer, e.g. 'affine1'. It does not + necessarily correspond to a component name. + """ + + return self.name + + ###### Functions that might be overridden by the child class: ##### + + def set_default_configs(self): + """Child classes should override this. + """ + + raise Exception("Child classes must override set_default_configs().") + + def set_derived_configs(self): + """This is expected to be called after set_configs and before + check_configs(). + """ + + if self.config['dim'] <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + + def check_configs(self): + """child classes should override this. + """ + + pass + + def get_input_descriptor_names(self): + """This function, which may be (but usually will not have to be) + overridden by child classes, returns a list of names of the input + descriptors expected by this component. Typically this would just + return ['input'] as most layers just have one 'input'. However some + layers might require more inputs (e.g. cell state of previous LSTM layer + in Highway LSTMs). It is used in the function 'normalize_descriptors()'. + This implementation will work for layer types whose only + Descriptor-valued config is 'input'. + If a child class adds more inputs, or does not have an input + (e.g. the XconfigInputLayer), it should override this function's + implementation to something like: `return ['input', 'input2']` + """ + + return [ 'input' ] + + def auxiliary_outputs(self): + """Returns a list of all auxiliary outputs that this layer supports. + These are either 'None' for the regular output, or a string + (e.g. 'projection' or 'memory_cell') for any auxiliary outputs that + the layer might provide. Most layer types will not need to override + this. + """ + + return [ None ] + + def output_name(self, auxiliary_output = None): + """Called with auxiliary_output == None, this returns the component-node + name of the principal output of the layer (or if you prefer, the text + form of a descriptor that gives you such an output; such as + Append(some_node, some_other_node)). + The 'auxiliary_output' argument is a text value that is designed for + extensions to layers that have additional auxiliary outputs. 
+ For example, to implement a highway LSTM you need the memory-cell of a + layer, so you might allow auxiliary_output='memory_cell' for such a + layer type, and it would return the component node or a suitable + Descriptor: something like 'lstm3.c_t' + """ + + raise Exception("Child classes must override output_name()") + + def output_dim(self, auxiliary_output = None): + """The dimension that this layer outputs. The 'auxiliary_output' + parameter is for layer types which support auxiliary outputs. + """ + + raise Exception("Child classes must override output_dim()") + + def get_full_config(self): + """This function returns lines destined for the 'full' config format, as + would be read by the C++ programs. Since the program + xconfig_to_configs.py writes several config files, this function returns + a list of pairs of the form (config_file_basename, line), + e.g. something like + [ ('init', 'input-node name=input dim=40'), + ('ref', 'input-node name=input dim=40') ] + which would be written to config_dir/init.config and config_dir/ref.config. + """ + + raise Exception("Child classes must override get_full_config()") + + +class XconfigInputLayer(XconfigLayerBase): + """This class is for lines like + 'input name=input dim=40' + or + 'input name=ivector dim=100' + in the config file. + """ + + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'input' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + + def set_default_configs(self): + + self.config = { 'dim': -1} + + def check_configs(self): + + if self.config['dim'] <= 0: + raise xparser_error("Dimension of input-layer '{0}'" + "should be positive.".format(self.name), + self.str()) + + def get_input_descriptor_names(self): + + return [] # there is no 'input' field in self.config. + + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the input + assert auxiliary_outputs is None + return self.config['dim'] + + def get_full_config(self): + + # unlike other layers the input layers need to be printed in + # 'init.config' (which initializes the neural network prior to the LDA) + ans = [] + for config_name in [ 'init', 'ref', 'final' ]: + ans.append( (config_name, + 'input-node name={0} dim={1}'.format(self.name, + self.config['dim']))) + return ans + + + +class XconfigTrivialOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output name=output input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + This is for outputs that are not really output "layers" + (there is no affine transform or nonlinearity), they just directly map to an + output-node in nnet3. + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]' } + + def check_configs(self): + + pass # nothing to check; descriptor-parsing can't happen in this function. 
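+    # For orientation, a rough usage sketch (the file name 'network.xconfig' and
+    # the dims below are hypothetical): the driver script xconfig_to_configs.py
+    # reads the xconfig file into a list of layer objects and then collects the
+    # (config-basename, line) pairs that each layer's get_full_config() returns:
+    #
+    #   all_layers = read_xconfig_file('network.xconfig')
+    #   # where network.xconfig might contain lines like:
+    #   #   input name=ivector dim=100
+    #   #   input name=input dim=40
+    #   #   output name=output input=Append(input@-1, input@0, input@1)
+    #   for layer in all_layers:
+    #       for config_basename, line in layer.get_full_config():
+    #           print(config_basename, line)   # destined for init/ref/final.config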
+ + def output_name(self, auxiliary_outputs = None): + + # there are no auxiliary outputs as this layer will just pass the output + # of the previous layer + assert auxiliary_outputs is None + return self.name + + def output_dim(self, auxiliary_outputs = None): + + assert auxiliary_outputs is None + # note: each value of self.descriptors is (descriptor, dim, normalized-string, output-string). + return self.descriptors['input']['dim'] + + def get_full_config(self): + + # the input layers need to be printed in 'init.config' (which + # initializes the neural network prior to the LDA), in 'ref.config', + # which is a version of the config file used for getting left and right + # context (it doesn't read anything for the LDA-like transform and/or + # presoftmax-prior-scale components) + # In 'full.config' we write everything, this is just for reference, + # and also for cases where we don't use the LDA-like transform. + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'output-string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_str = self.descriptors['input']['final-string'] + + for config_name in ['init', 'ref', 'final' ]: + ans.append( (config_name, + 'output-node name={0} input={1}'.format( + self.name, descriptor_final_str))) + return ans + + +class XconfigOutputLayer(XconfigLayerBase): + """This class is for lines like + 'output-layer name=output dim=4257 input=Append(input@-1, input@0, input@1, ReplaceIndex(ivector, t, 0))' + By default this includes a log-softmax component. The parameters are + initialized to zero, asthis is best for output layers. + + Parameters of the class, and their defaults: + input='[-1]' : Descriptor giving the input of the layer. + dim=None : Output dimension of layer, will normally equal the number of pdfs. + include-log-softmax=true : setting it to false will omit the + log-softmax component- useful for chain models. + objective-type=linear : the only other choice currently is + 'quadratic', for use in regression problems + learning-rate-factor=1.0 : Learning rate factor for the final + affine component, multiplies the standard learning rate. normally + you'll leave this as-is, but for xent regularization output layers + for chain models you'll want to set + learning-rate-factor=(0.5/xent_regularize), + normally learning-rate-factor=5.0 since xent_regularize is + normally 0.1. + presoftmax-scale-file=None : If set, a filename for a vector that + will be used to scale the output of the affine component before the + log-softmax (if include-log-softmax=true), or before the output + (if not). This is helpful to avoid instability in training due to + some classes having much more data than others. The way we normally + create this vector is to take the priors of the classes to the + power -0.25 and rescale them so the average is 1.0. This factor + -0.25 is referred to as presoftmax_prior_scale_power in scripts. In + the scripts this would normally be set to + config_dir/presoftmax_prior_scale.vec + """ + + def __init__(self, first_token, key_to_value, prev_names = None): + + assert first_token == 'output-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. 
+ self.config = {'input' : '[-1]', + 'dim' : -1, + 'include-log-softmax' : True, + # this would be false for chain models + 'objective-type' : 'linear', + # see Nnet::ProcessOutputNodeConfigLine in + # nnet-nnet.cc for other options + 'learning-rate-factor' : 1.0, + 'presoftmax-scale-file' : '', + # used in DNN (not RNN) training when using + # frame-level objfns, + 'max-change' : 1.5, + 'param-stddev' : 0.0, + 'bias-stddev' : 0.0, + 'output-delay' : 0 + } + + def check_configs(self): + + if self.config['dim'] <= -1: + raise xparser_error("In output-layer, dim has invalid value {0}" + "".format(self.config['dim']), self.str()) + + if self.config['objective-type'] != 'linear' and \ + self.config['objective_type'] != 'quadratic': + raise xparser_error("In output-layer, objective-type has" + " invalid value {0}" + "".format(self.config['objective-type']), + self.str()) + + if self.config['learning-rate-factor'] <= 0.0: + raise xparser_error("In output-layer, learning-rate-factor has" + " invalid value {0}" + "".format(self.config['learning-rate-factor']), + self.str()) + + + # you cannot access the output of this layer from other layers... see + # comment in output_name for the reason why. + def auxiliary_outputs(self): + + return [] + + def output_name(self, auxiliary_outputs = None): + + # Note: nodes of type output-node in nnet3 may not be accessed in + # Descriptors, so calling this with auxiliary_outputs=None doesn't + # make sense. But it might make sense to make the output of the softmax + # layer and/or the output of the affine layer available as inputs to + # other layers, in some circumstances. + # we'll implement that when it's needed. + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def output_dim(self, auxiliary_output = None): + + # see comment in output_name(). + raise xparser_error("Outputs of output-layer may not be used by other" + " layers", self.str()) + + def get_full_config(self): + + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + objective_type = self.config['objective-type'] + learning_rate_factor = self.config['learning-rate-factor'] + include_log_softmax = self.config['include-log-softmax'] + presoftmax_scale_file = self.config['presoftmax-scale-file'] + param_stddev = self.config['param-stddev'] + bias_stddev = self.config['bias-stddev'] + output_delay = self.config['output-delay'] + max_change = self.config['max-change'] + + # note: ref.config is used only for getting the left-context and + # right-context of the network; + # final.config is where we put the actual network definition. + for config_name in [ 'ref', 'final' ]: + # First the affine node. 
+ line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} ' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change) + + ('learning-rate-factor={0} '.format(learning_rate_factor) + if learning_rate_factor != 1.0 else '')) + ans.append((config_name, line)) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + if presoftmax_scale_file is not '' and config_name == 'final': + # don't use the presoftmax-scale in 'ref.config' since that + # file won't exist at the time we evaluate it. + # (ref.config is used to find the left/right context). + line = ('component name={0}.fixed-scale' + ' type=FixedScaleComponent scales={1}' + ''.format(self.name, presoftmax_scale_file)) + ans.append((config_name, line)) + + line = ('component-node name={0}.fixed-scale' + ' component={0}.fixed-scale input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.fixed-scale'.format(self.name) + + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + ans.append((config_name, line)) + + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + ans.append((config_name, line)) + cur_node = '{0}.log-softmax'.format(self.name) + + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + + line = ('output-node name={0} input={1}'.format(self.name, cur_node)) + ans.append((config_name, line)) + return ans + + +# This class is for parsing lines like +# 'relu-renorm-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# or: +# 'sigmoid-layer name=layer1 dim=1024 input=Append(-3,0,3)' +# which specify addition of an affine component and a sequence of non-linearities. +# Here, the name of the layer itself dictates the sequence of nonlinearities +# that are applied after the affine component; the name should contain some +# combination of 'relu', 'renorm', 'sigmoid' and 'tanh', +# and these nonlinearities will be added along with the affine component. +# +# The dimension specified is the output dim; the input dim is worked out from the input descriptor. +# This class supports only nonlinearity types that do not change the dimension; we can create +# another layer type to enable the use p-norm and similar dimension-reducing nonlinearities. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer, e.g. 1024] +# self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] +# +class XconfigBasicLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + # Here we just list some likely combinations.. you can just add any + # combinations you want to use, to this list. + print(first_token) + assert first_token in [ 'relu-layer', 'relu-renorm-layer', 'sigmoid-layer', + 'tanh-layer' ] + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. 
+ self.config = { 'input':'[-1]', + 'dim':-1, + 'max-change' : 0.75, + 'bias-stddev' : 0, + 'param-stddev' : -1, # default value is derived + 'self-repair-scale' : 1.0e-05, + 'target-rms' : 1.0, + 'ng-affine-options' : ''} + + def set_derived_configs(self): + super(XconfigBasicLayer, self).set_derived_configs() + if self.config['param-stddev'] < 0: + self.config['param-stddev'] = 1.0 / self.descriptors['input']['dim'] + + + def check_configs(self): + if self.config['dim'] < 0: + raise xparser_error("dim has invalid value {0}".format(self.config['dim']), self.str()) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: + raise xparser_error("self-repair-scale has invalid value {0}".format(self.config['self-repair-scale']), self.str()) + if self.config['target-rms'] < 0.0: + raise xparser_error("target-rms has invalid value {0}".format(self.config['target-rms']), self.str()) + + def output_name(self, auxiliary_output=None): + # at a later stage we might want to expose even the pre-nonlinearity + # vectors + assert auxiliary_output == None + + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + last_nonlinearity = split_layer_name[-2] + # return something like: layer3.renorm + return '{0}.{1}'.format(self.name, last_nonlinearity) + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + + ans = [] + + split_layer_name = self.layer_type.split('-') + assert split_layer_name[-1] == 'layer' + nonlinearities = split_layer_name[:-1] + + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + self_repair_scale = self.config['self-repair-scale'] + target_rms = self.config['target-rms'] + param_stddev = self.config['param-stddev'] + bias_stddev = self.config['bias-stddev'] + max_change = self.config['max-change'] + ng_opt_str = self.config['ng-affine-options'] + + for config_name in [ 'ref', 'final' ]: + # First the affine node. 
+ line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5}' + ' {6}' + ''.format(self.name, input_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_opt_str)) + ans.append((config_name, line)) + + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, descriptor_final_string)) + ans.append((config_name, line)) + cur_node = '{0}.affine'.format(self.name) + + for nonlinearity in nonlinearities: + if nonlinearity == 'relu': + line = ('component name={0}.{1}' + ' type=RectifiedLinearComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'sigmoid': + line = ('component name={0}.{1}' + ' type=SigmoidComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'tanh': + line = ('component name={0}.{1}' + ' type=TanhComponent dim={2}' + ' self-repair-scale={3}' + ''.format(self.name, nonlinearity, output_dim, + self_repair_scale)) + + elif nonlinearity == 'renorm': + line = ('component name={0}.{1}' + ' type=NormalizeComponent dim={2}' + ' target-rms={3}' + ''.format(self.name, nonlinearity, output_dim, + target_rms)) + + else: + raise xparser_error("Unknown nonlinearity type:" + "{0}".format(nonlinearity), self.str()) + + ans.append((config_name, line)) + line = ('component-node name={0}.{1}' + ' component={0}.{1} input={2}' + ''.format(self.name, nonlinearity, cur_node)) + + ans.append((config_name, line)) + cur_node = '{0}.{1}'.format(self.name, nonlinearity) + return ans + + +# This class is for lines like +# 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat' +# +# The output dimension of the layer may be specified via 'dim=xxx', but if not specified, +# the dimension defaults to the same as the input. Note: we don't attempt to read that +# file at the time the config is created, because in the recipes, that file is created +# after the config files. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer; defaults to the same as the input dim.] +# affine-transform-file='' [Must be specified.] +# +class XconfigFixedAffineLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'fixed-affine-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = { 'input':'[-1]', + 'dim':-1, + 'affine-transform-file':''} + + def check_configs(self): + if self.config['affine-transform-file'] is None: + raise xparser_error("affine-transform-file must be set.", self.str()) + + def output_name(self, auxiliary_output = None): + # Fixed affine layer computes only one vector, there are no intermediate + # vectors. + assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. 
+ if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + transform_file = self.config['affine-transform-file'] + + + # to init.config we write an output-node with the name 'output' and + # with a Descriptor equal to the descriptor that's the input to this + # layer. This will be used to accumulate stats to learn the LDA transform. + line = 'output-node name=output input={0}'.format(descriptor_final_string) + ans.append(('init', line)) + + # write the 'real' component to final.config + line = 'component name={0} type=FixedAffineComponent matrix={1}'.format( + self.name, transform_file) + ans.append(('final', line)) + # write a random version of the component, with the same dims, to ref.config + line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format( + self.name, input_dim, output_dim) + ans.append(('ref', line)) + # the component-node gets written to final.config and ref.config. + line = 'component-node name={0} component={0} input={1}'.format( + self.name, descriptor_final_string) + ans.append(('final', line)) + ans.append(('ref', line)) + return ans + +# This class is for lines like +# 'affine-layer name=affine input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0))' +# +# The output dimension of the layer may be specified via 'dim=xxx', but if not specified, +# the dimension defaults to the same as the input. Note: we don't attempt to read that +# file at the time the config is created, because in the recipes, that file is created +# after the config files. +# +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# dim=None [Output dimension of layer; defaults to the same as the input dim.] +# +class XconfigAffineLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'affine-layer' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. 
+ # use None for optional parameters as we want to default to the C++ defaults + # C++ component provides more options but I will just expose these for now + # Note : The type of the parameter is determined based on the value assigned + # so please use decimal point if your parameter is a float + self.config = { 'input' : '[-1]', + 'dim' : -1, + 'param-stddev' : -1.0, # this has to be initialized to 1/sqrt(input_dim) + 'bias-stddev' : 1.0, + 'bias-mean' : 0.0, + 'max-change' : 0.75, + 'learning-rate-factor' : 1.0, + 'ng-affine-options' : ''} + + def set_derived_configs(self): + super(XconfigAffineLayer, self).set_derived_configs() + if self.config['param-stddev'] < 0: + self.config['param-stddev'] = 1.0 / self.descriptors['input']['dim'] + + def check_configs(self): + if self.config['dim'] <= 0: + raise xparser_error("dim specified is invalid".format(self.name, self.layer_type), self.str()) + + def output_name(self, auxiliary_output = None): + # affine layer computes only one vector, there are no intermediate + # vectors. + assert auxiliary_output == None + return self.name + + def output_dim(self, auxiliary_output = None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + output_dim = self.descriptors['input']['dim'] + + return output_dim + + def get_full_config(self): + ans = [] + + # note: each value of self.descriptors is (descriptor, dim, + # normalized-string, output-string). + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + descriptor_final_string = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.output_dim() + + option_string='' + for key in ['param-stddev', 'bias-stddev', 'bias-mean', 'max-change']: + option_string += ' {0}={1}'.format(key, self.config[key]) + option_string += self.config['ng-affine-options'] + + conf_lines = [] + # write the 'real' component to final.config + conf_lines.append('component name={n} type=NaturalGradientAffineComponent ' + 'input-dim={i} output-dim={o} {opts}'.format(n = self.name, + i = input_dim, + o = output_dim, + opts = option_string)) + # the component-node gets written to final.config and ref.config. + conf_lines.append('component-node name={0} component={0} input={1}'.format(self.name, + descriptor_final_string)) + + # the config is same for both final and ref configs + for conf_name in ['final', 'ref']: + for line in conf_lines: + ans.append((conf_name, line)) + return ans + + +def test_layers(): + # for some config lines that should be printed the same way as they + # are read, check that this is the case. + for x in [ 'input name=input dim=30' ]: + assert str(config_line_to_object(x, [])) == x diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py new file mode 100644 index 00000000000..fa356d15a18 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -0,0 +1,7 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. 
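+# This module only aggregates the layer definitions from basic_layers.py and
+# lstm.py, so that parser.py can refer to every layer class through the single
+# 'xlayers' namespace.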
+ +from basic_layers import * +from lstm import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py new file mode 100644 index 00000000000..7b37958f81b --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -0,0 +1,532 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2016 Yiming Wang +# Apache 2.0. + + +""" This module has the implementations of different LSTM layers. +""" +import re + +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# This class is for lines like +# 'lstm-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph without output projections. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. This is the threshold used to decide if clipping has to be activated ] +# norm-based-clipping=True [specifies if the gradient clipping has to activated based on total norm or based on per-element magnitude] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] +# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +class XconfigLstmLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstm-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'clipping-threshold' : 30.0, + 'norm-based-clipping' : True, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0 + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.InputDim() + + def check_configs(self): + key = 'cell-dim' + if self.config['cell-dim'] <= 0: + raise xparser_error("cell-dim has invalid value {0}.".format(self.config[key]), self.str()) + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key])) + + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'm_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str()) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in 
self.auxiliary_outputs():
+                if auxiliary_output == 'c_t':
+                    return self.config['cell-dim']
+                # add code for other auxiliary_outputs here when we decide to expose them
+            else:
+                raise xparser_error("Unknown auxiliary output name {0}".format(auxiliary_output), self.str())
+
+        return self.config['cell-dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self.generate_lstm_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user-specified matrices in LSTM initialization,
+                # so the 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    # convenience function to generate the LSTM config
+    def generate_lstm_config(self):
+
+        # assign some variables to reduce verbosity
+        name = self.name
+        # in the code below we refer to descriptor strings simply as descriptors, for conciseness
+        input_dim = self.descriptors['input']['dim']
+        input_descriptor = self.descriptors['input']['final-string']
+        cell_dim = self.config['cell-dim']
+        delay = self.config['delay']
+
+        repair_nonlin = self.config['self-repair-scale-nonlinearity']
+        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
+        bptrunc_str = ("clipping-threshold={0}"
+                       " zeroing-threshold={1}"
+                       " zeroing-interval={2}"
+                       " recurrence-interval={3}"
+                       "".format(self.config['clipping-threshold'],
+                                 self.config['zeroing-threshold'],
+                                 self.config['zeroing-interval'],
+                                 abs(delay)))
+        affine_str = self.config['ng-affine-options']
+        # Natural gradient per-element scale parameters
+        # TODO: decide if we want to keep exposing these options
+        pes_str = self.config['ng-per-element-scale-options']
+        if re.search('param-mean', pes_str) is None and \
+           re.search('param-stddev', pes_str) is None:
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
+
+        configs = []
+
+        # the equations implemented here are
+        # TODO: write these
+        # naming convention
+        # <layer-name>.W_<outputname>.<input_name> e.g.
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Input gate control : W_i* matrices") + configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Forget gate control : W_f* matrices") + configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Output gate control : W_o* matrices") + configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Cell input matrices : W_c* matrices") + configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + 
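+        # For reference, the recurrence that the nodes in this method appear to
+        # implement, with delay d (normally -1), '*' denoting elementwise
+        # multiplication and [x, r] denoting appending; biases live inside the
+        # affine components, and the BackpropTruncationComponents act as the
+        # identity in the forward pass:
+        #   i_t = sigmoid(W_i.xr [x_t, r_{t+d}] + w_i.c * c_{t+d})
+        #   f_t = sigmoid(W_f.xr [x_t, r_{t+d}] + w_f.c * c_{t+d})
+        #   g_t = tanh(W_c.xr [x_t, r_{t+d}])
+        #   c_t = f_t * c_{t+d} + i_t * g_t
+        #   o_t = sigmoid(W_o.xr [x_t, r_{t+d}] + w_o.c * c_t)
+        #   m_t = o_t * tanh(c_t), and r_t = m_t feeds the recurrence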
configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name)) + + return configs + + +# This class is for lines like +# 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# It generates an LSTM sub-graph with output projections. It can also generate +# outputs without projection, but you could use the XconfigLstmLayer for this +# simple LSTM. +# The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, +# the dimension defaults to the same as the input. +# See other configuration values below. +# +# Parameters of the class, and their defaults: +# input='[-1]' [Descriptor giving the input of the layer.] +# cell-dim=-1 [Dimension of the cell] +# recurrent_projection_dim [Dimension of the projection used in recurrent connections] +# non_recurrent_projection_dim [Dimension of the projection in non-recurrent connections] +# delay=-1 [Delay in the recurrent connections of the LSTM ] +# clipping-threshold=30 [nnet3 LSTMs use a gradient clipping component at the recurrent connections. 
This is the threshold used to decide if clipping has to be activated ] +# norm-based-clipping=True [specifies if the gradient clipping has to activated based on total norm or based on per-element magnitude] +# self_repair_scale_nonlinearity=1e-5 [It is a constant scaling the self-repair vector computed in derived classes of NonlinearComponent] +# i.e., SigmoidComponent, TanhComponent and RectifiedLinearComponent ] +# ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the LSTM ] +# ng-affine-options='' [Additional options used for the full matrices in the LSTM, can be used to do things like set biases to initialize to 1] +class XconfigLstmpLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + print first_token + assert first_token == "lstmp-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input' : '[-1]', + 'cell-dim' : -1, # this is a compulsory argument + 'recurrent-projection-dim' : -1, + 'non-recurrent-projection-dim' : -1, + 'clipping-threshold' : 30.0, + 'norm-based-clipping' : True, + 'delay' : -1, + 'ng-per-element-scale-options' : ' max-change=0.75 ', + 'ng-affine-options' : ' max-change=0.75 ', + 'self-repair-scale-nonlinearity' : 0.00001, + 'zeroing-interval' : 20, + 'zeroing-threshold' : 3.0 + } + + def set_derived_configs(self): + if self.config['cell-dim'] <= 0: + self.config['cell-dim'] = self.InputDim() + + for key in ['recurrent-projection-dim', 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + self.config[key] = self.config['cell-dim'] / 2 + + def check_configs(self): + for key in ['cell-dim', 'recurrent-projection-dim', 'non-recurrent-projection-dim']: + if self.config[key] <= 0: + raise xparser_error("{0} has invalid value {1}.".format(key, self.config[key]), self.str()) + + for key in ['self-repair-scale-nonlinearity']: + if self.config[key] < 0.0 or self.config[key] > 1.0: + raise xparser_error("{0} has invalid value {2}.".format(self.layer_type, + key, + self.config[key])) + def auxiliary_outputs(self): + return ['c_t'] + + def output_name(self, auxiliary_output = None): + node_name = 'rp_t' + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + node_name = auxiliary_output + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return '{0}.{1}'.format(self.name, node_name) + + def output_dim(self, auxiliary_output = None): + if auxiliary_output is not None: + if auxiliary_output in self.auxiliary_outputs(): + if node_name == 'c_t': + return self.config['cell-dim'] + # add code for other auxiliary_outputs here when we decide to expose them + else: + raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + + return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + + def get_full_config(self): + ans = [] + config_lines = self.generate_lstm_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + pes_str = self.config['ng-per-element-scale-options'] + + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', pes_str) is None and \ + re.search('param-stddev', pes_str) is None: + pes_str += " param-mean=0.0 param-stddev=1.0 " + + configs = [] + # the equations implemented here are from Sak et. al. "Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling" + # http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43905.pdf + # naming convention + # .W_. e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Input gate control : W_i* matrices") + configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Forget gate control : W_f* matrices") + configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Output gate control : W_o* matrices") + configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Cell input matrices : W_c* matrices") + configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component 
name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + + recurrent_connection = '{0}.r_t'.format(name) + configs.append("# i_t") + configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + configs.append("# f_t") + configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + + configs.append("# o_t") + configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + configs.append("# h_t") + configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + configs.append("# g_t") + configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m 
type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs + +# Same as the LSTMP layer except that the matrix multiplications are combined +# we probably keep only version after experimentation. One year old experiments +# show that this version is slightly worse and might require some tuning +class XconfigLstmpcLayer(XconfigLstmpLayer): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "lstmpc-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + # convenience function to generate the LSTM config + def generate_lstm_config(self): + # assign some variables to reduce verbosity + name = self.name + # in the below code we will just call descriptor_strings as descriptors for conciseness + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + cell_dim = self.config['cell-dim'] + rec_proj_dim = self.config['recurrent-projection-dim'] + nonrec_proj_dim = self.config['non-recurrent-projection-dim'] + delay = self.config['delay'] + + repair_nonlin = self.config['self-repair-scale-nonlinearity'] + repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' + bptrunc_str = ("clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format(self.config['clipping-threshold'], + self.config['zeroing-threshold'], + self.config['zeroing-interval'], + abs(delay))) + affine_str = self.config['ng-affine-options'] + # Natural gradient per element scale parameters + # TODO: decide if we want to keep exposing these options + if re.search('param-mean', ng_per_element_scale_options) is None and \ + re.search('param-stddev', ng_per_element_scale_options) is None: + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + pes_str = ng_per_element_scale_options + + configs = [] + # naming convention + # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] + configs.append("# Full W_ifoc* matrix") + configs.append("component name={0}.W_ifoc.xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, 4*cell_dim, affine_str)) + configs.append("# note : the cell outputs pass through a diagonal matrix") + + # we will not combine the diagonal matrix operations as one of these has a different delay + configs.append("# note : the cell outputs pass through a diagonal matrix") + configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, pes_str)) + + configs.append("# Defining the non-linearities") + configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + + configs.append("# Defining the components for other cell computations") + configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + + # c1_t and c2_t defined below + configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) + rec_connection = '{0}.rp_t'.format(name) + + component_nodes.append("component-node name={0}.ifoc_t component={0}.W_ifoc.xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + + + offset = 0 + component_nodes.append("# i_t") + component_nodes.append("dim-range-node name={0}.i1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.i2_t component={0}.w_i.cinput={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("dim-range-node name={0}.f1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + component_nodes.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, 
{0}.f2_t)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("dim-range-node name={0}.o1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + component_nodes.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("dim-range-node name={0}.g1_t input-node={0}.ifoc_t dim-offset={1} dim={2}".format(name, offset, cell_dim)) + offset += cell_dim + component_nodes.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + + + configs.append("# parts of c_t") + configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) + configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + + configs.append("# m_t") + configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + + # add the recurrent connections + configs.append("# projection matrices : Wrm and Wpm") + configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, affine_str)) + configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}".format(name, recurrent_projection_dim, bptrunc_str)) + + configs.append("# r_t and p_t : rp_t will be the output") + configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t".format(name)) + configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py new file mode 100644 index 00000000000..7aacba1ee8f --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -0,0 +1,90 @@ +# Copyright 2016 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# Apache 2.0. + +""" This module contains the top level xconfig parsing functions. +""" + +import libs.nnet3.xconfig.layers as xlayers +import libs.nnet3.xconfig.utils as xutils +from libs.nnet3.xconfig.utils import XconfigParserError as xparser_error + + +# We have to modify this dictionary when adding new layers +config_to_layer = { + 'input' : xlayers.XconfigInputLayer, + 'output' : xlayers.XconfigTrivialOutputLayer, + 'output-layer' : xlayers.XconfigOutputLayer, + 'relu-layer' : xlayers.XconfigBasicLayer, + 'relu-renorm-layer' : xlayers.XconfigBasicLayer, + 'sigmoid-layer' : xlayers.XconfigBasicLayer, + 'tanh-layer' : xlayers.XconfigBasicLayer, + 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, + 'affine-layer' : xlayers.XconfigAffineLayer, + 'lstm-layer' : xlayers.XconfigLstmLayer, + 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmpc-layer' : xlayers.XconfigLstmpcLayer + } + +# Converts a line as parsed by ParseConfigLine() into a first +# token e.g. 'input-layer' and a key->value map, into +# an objet inherited from XconfigLayerBase. 
+# 'prev_names' is a list of previous layer names, it's needed +# to parse things like '[-1]' (meaning: the previous layer) +# when they appear in Desriptors. +def parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names): + + conf_line = first_token + ' ' + ' '.join(['{0}={1}'.format(x,y) for x,y in key_to_value.items()]) + + if not config_to_layer.has_key(first_token): + raise xparser_error("No such layer type.", conf_line) + + try: + return config_to_layer[first_token](first_token, key_to_value, prev_names) + except xparser_error as e: + if e.conf_line is None: + # we want to throw informative errors which point to the xconfig line + e.conf_line = conf_line + raise + +# Uses ParseConfigLine() to turn a config line that has been parsed into +# a first token e.g. 'affine-layer' and a key->value map like { 'dim':'1024', 'name':'affine1' }, +# and then turns this into an object representing that line of the config file. +# 'prev_names' is a list of the names of preceding lines of the +# config file. +def config_line_to_object(config_line, prev_names = None): + (first_token, key_to_value) = xutils.parse_config_line(config_line) + return parsed_line_to_xconfig_layer(first_token, key_to_value, prev_names) + +# This function reads an xconfig file and returns it as a list of layers +# (usually we use the variable name 'all_layers' elsewhere for this). +# It will die if the xconfig file is empty or if there was +# some error parsing it. +def read_xconfig_file(xconfig_filename): + try: + f = open(xconfig_filename, 'r') + except Exception as e: + sys.exit("{0}: error reading xconfig file '{1}'; error was {2}".format( + sys.argv[0], xconfig_filename, repr(e))) + all_layers = [] + while True: + line = f.readline() + if line == '': + break + x = xutils.parse_config_line(line) + if x is None: + continue # line was blank or only comments. + (first_token, key_to_value) = x + # the next call will raise an easy-to-understand exception if + # it fails. + this_layer = parsed_line_to_xconfig_layer(first_token, + key_to_value, + all_layers) + all_layers.append(this_layer) + if len(all_layers) == 0: + raise xparser_error("{0}: xconfig file '{1}' is empty".format( + sys.argv[0], xconfig_filename)) + f.close() + return all_layers + + diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py new file mode 100644 index 00000000000..87c9d880089 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -0,0 +1,615 @@ +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey). +# License: Apache 2.0. + +# This library contains various utilities that are involved in processing +# of xconfig -> config conversion. It contains "generic" lower-level code +# while xconfig_layers.py contains the code specific to layer types. + +from __future__ import print_function +import re +import sys + + +class XconfigParserError(RuntimeError): + def __init__(self, error_msg, conf_line=None): + self.conf_line = conf_line + if conf_line is not None: + self.msg = 'While parsing "{c}" :{e}'.format(c=conf_line, e=error_msg) + else: + self.msg = error_msg + + def __str__(self): + return self.msg + +# [utility function used in xconfig_layers.py] +# Given a list of objects of type XconfigLayerBase ('all_layers'), +# including at least the layers preceding 'current_layer' (and maybe +# more layers), return the names of layers preceding 'current_layer' +# This will be used in parsing expressions like [-1] in descriptors +# (which is an alias for the previous layer). 
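For readers following the xconfig parsing code, the snippet below is a minimal, self-contained sketch of the dispatch-table pattern that parsed_line_to_xconfig_layer uses. The dummy classes and the reduced config_to_layer map are stand-ins for the real classes in libs.nnet3.xconfig.layers, and the layer-line values are made up for illustration only.

from __future__ import print_function

class XconfigParserError(RuntimeError):
    pass

# Stand-ins for the real layer classes; they just record what they were given.
class DummyInputLayer(object):
    def __init__(self, first_token, key_to_value, prev_names):
        self.first_token = first_token
        self.config = key_to_value

class DummyBasicLayer(DummyInputLayer):
    pass

config_to_layer = {
    'input': DummyInputLayer,
    'relu-renorm-layer': DummyBasicLayer,
}

def parsed_line_to_layer(first_token, key_to_value, prev_names):
    # Unknown first tokens raise the parser error, as in parser.py above.
    if first_token not in config_to_layer:
        raise XconfigParserError("No such layer type: " + first_token)
    return config_to_layer[first_token](first_token, key_to_value, prev_names)

layer = parsed_line_to_layer('relu-renorm-layer',
                             {'name': 'tdnn1', 'dim': '512', 'input': 'Append(-1,0,1)'},
                             prev_names=['input'])
print(layer.config['name'])  # prints: tdnn1
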
+def get_prev_names(all_layers, current_layer): + prev_names = [] + for layer in all_layers: + if layer is current_layer: + break + prev_names.append(layer.get_name()) + prev_names_set = set() + for name in prev_names: + if name in prev_names_set: + raise XconfigParserError("{0}: Layer name {1} is used more than once.".format( + sys.argv[0], name), current_layer.str()) + prev_names_set.add(name) + return prev_names + + +# This is a convenience function to parser the auxiliary output name from the +# full layer name + +def split_layer_name(full_layer_name): + assert isinstance(full_layer_name, str) + split_name = full_layer_name.split('.') + if len(split_name) == 0: + raise XconfigParserError("Bad layer name: " + full_layer_name) + layer_name = split_name[0] + if len(split_name) == 1: + auxiliary_output = None + else: + # we probably expect len(split_name) == 2 in this case, + # but no harm in allowing dots in the auxiliary_output. + auxiliary_output = '.'.join(split_name[1:]) + + return [layer_name, auxiliary_output] + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a dimension. 'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_dim_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format(layer_name, auxiliary_output), layer.str()) + return layer.output_dim(auxiliary_output) + # No such layer was found. + if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# [utility function used in xconfig_layers.py] +# this converts a layer-name like 'ivector' or 'input', or a sub-layer name like +# 'lstm2.memory_cell', into a descriptor (usually, but not required to be a simple +# component-node name) that can appear in the generated config file. 'all_layers' is a vector of objects +# inheriting from XconfigLayerBase. 'current_layer' is provided so that the +# function can make sure not to look in layers that appear *after* this layer +# (because that's not allowed). +def get_string_from_layer_name(all_layers, current_layer, full_layer_name): + layer_name, auxiliary_output = split_layer_name(full_layer_name) + for layer in all_layers: + if layer is current_layer: + break + if layer.get_name() == layer_name: + if not auxiliary_output in layer.auxiliary_outputs() and auxiliary_output is not None: + raise XconfigParserError("Layer '{0}' has no such auxiliary output: '{1}' ({0}.{1})".format( + layer_name, auxiliary_output)) + return layer.output_name(auxiliary_output) + # No such layer was found. 
+ if layer_name in [ layer.get_name() for layer in all_layers ]: + raise XconfigParserError("Layer '{0}' was requested before it appeared in " + "the xconfig file (circular dependencies or out-of-order " + "layers".format(layer_name)) + else: + raise XconfigParserError("No such layer: '{0}'".format(layer_name)) + + +# This function, used in converting string values in config lines to +# configuration values in self.config in layers, attempts to +# convert 'string_value' to an instance dest_type (which is of type Type) +# 'key' is only needed for printing errors. +def convert_value_to_type(key, dest_type, string_value): + if dest_type == type(bool()): + if string_value == "True" or string_value == "true": + return True + elif string_value == "False" or string_value == "false": + return False + else: + raise XconfigParserError("Invalid configuration value {0}={1} (expected bool)".format( + key, string_value)) + elif dest_type == type(int()): + try: + return int(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(float()): + try: + return float(string_value) + except: + raise XconfigParserError("Invalid configuration value {0}={1} (expected int)".format( + key, string_value)) + elif dest_type == type(str()): + return string_value + + + +# This class parses and stores a Descriptor-- expression +# like Append(Offset(input, -3), input) and so on. +# For the full range of possible expressions, see the comment at the +# top of src/nnet3/nnet-descriptor.h. +# Note: as an extension to the descriptor format used in the C++ +# code, we can have e.g. input@-3 meaning Offset(input, -3); +# and if bare integer numbers appear where a descriptor was expected, +# they are interpreted as Offset(prev_layer, -3) where 'prev_layer' +# is the previous layer in the config file. + +# Also, in any place a raw input/layer/output name can appear, we accept things +# like [-1] meaning the previous input/layer/output's name, or [-2] meaning the +# last-but-one input/layer/output, and so on. +class Descriptor: + def __init__(self, + descriptor_string = None, + prev_names = None): + # self.operator is a string that may be 'Offset', 'Append', + # 'Sum', 'Failover', 'IfDefined', 'Offset', 'Switch', 'Round', + # 'ReplaceIndex'; it also may be None, representing the base-case + # (where it's just a layer name) + + # self.items will be whatever items are + # inside the parentheses, e.g. if this is Sum(foo bar), + # then items will be [d1, d2], where d1 is a Descriptor for + # 'foo' and d1 is a Descriptor for 'bar'. However, there are + # cases where elements of self.items are strings or integers, + # for instance in an expression 'ReplaceIndex(ivector, x, 0)', + # self.items would be [d, 'x', 0], where d is a Descriptor + # for 'ivector'. In the case where self.operator is None (where + # this Descriptor represents just a bare layer name), self. + # items contains the name of the input layer as a string. + self.operator = None + self.items = None + + if descriptor_string != None: + try: + tokens = tokenize_descriptor(descriptor_string, prev_names) + pos = 0 + (d, pos) = parse_new_descriptor(tokens, pos, prev_names) + # note: 'pos' should point to the 'end of string' marker + # that terminates 'tokens'. + if pos != len(tokens) - 1: + raise XconfigParserError("Parsing Descriptor, saw junk at end: " + + ' '.join(tokens[pos:-1])) + # copy members from d. 
+ self.operator = d.operator + self.items = d.items + except XconfigParserError as e: + traceback.print_tb(sys.exc_info()[2]) + raise XconfigParserError("Error parsing Descriptor '{0}', specific error was: {1}".format( + descriptor_string, repr(e))) + + # This is like the str() function, but it uses the layer_to_string function + # (which is a function from strings to strings) to convert layer names (or + # in general sub-layer names of the form 'foo.bar') to the component-node + # (or, in general, descriptor) names that appear in the final config file. + # This mechanism gives those designing layer types the freedom to name their + # nodes as they want. + def config_string(self, layer_to_string): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return layer_to_string(self.items[0]) + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join( + [ item.config_string(layer_to_string) if isinstance(item, Descriptor) else str(item) + for item in self.items]) + ')' + + def str(self): + if self.operator is None: + assert len(self.items) == 1 and isinstance(self.items[0], str) + return self.items[0] + else: + assert isinstance(self.operator, str) + return self.operator + '(' + ', '.join([str(item) for item in self.items]) + ')' + + def __str__(self): + return self.str() + + # This function returns the dimension (i.e. the feature dimension) of the + # descriptor. It takes 'layer_to_dim' which is a function from + # layer-names (including sub-layer names, like lstm1.memory_cell) to + # dimensions, e.g. you might have layer_to_dim('ivector') = 100, or + # layer_to_dim('affine1') = 1024. + # note: layer_to_dim will raise an exception if a nonexistent layer or + # sub-layer is requested. + def dim(self, layer_to_dim): + if self.operator is None: + # base-case: self.items = [ layer_name ] (or sub-layer name, like + # 'lstm.memory_cell'). + return layer_to_dim(self.items[0]) + elif self.operator in [ 'Sum', 'Failover', 'IfDefined', 'Switch' ]: + # these are all operators for which all args are descriptors + # and must have the same dim. + dim = self.items[0].dim(layer_to_dim) + for desc in self.items[1:]: + next_dim = desc.dim(layer_to_dim) + if next_dim != dim: + raise XparserError("In descriptor {0}, different fields have different " + "dimensions: {1} != {2}".format(self.str(), dim, next_dim)) + return dim + elif self.operator in [ 'Offset', 'Round', 'ReplaceIndex' ]: + # for these operators, only the 1st arg is relevant. + return self.items[0].dim(layer_to_dim) + elif self.operator == 'Append': + return sum([ x.dim(layer_to_dim) for x in self.items]) + else: + raise XconfigParserError("Unknown operator {0}".format(self.operator)) + + + +# This just checks that seen_item == expected_item, and raises an +# exception if not. +def expect_token(expected_item, seen_item, what_parsing): + if seen_item != expected_item: + raise XconfigParserError("parsing {0}, expected '{1}' but got '{2}'".format( + what_parsing, expected_item, seen_item)) + +# returns true if 'name' is valid as the name of a line (input, layer or output); +# this is the same as IsValidname() in the nnet3 code. +def is_valid_line_name(name): + return isinstance(name, str) and re.match(r'^[a-zA-Z_][-a-zA-Z_0-9.]*', name) != None + +# This function for parsing Descriptors takes an array of tokens as produced +# by tokenize_descriptor. 
It parses a descriptor +# starting from position pos >= 0 of the array 'tokens', and +# returns a new position in the array that reflects any tokens consumed while +# parsing the descriptor. +# It returns a pair (d, pos) where d is the newly parsed Descriptor, +# and 'pos' is the new position after consuming the relevant input. +# 'prev_names' is so that we can find the most recent layer name for +# expressions like Append(-3, 0, 3) which is shorthand for the most recent +# layer spliced at those time offsets. +def parse_new_descriptor(tokens, pos, prev_names): + size = len(tokens) + first_token = tokens[pos] + pos += 1 + d = Descriptor() + + # when reading this function, be careful to note the indent level, + # there is an if-statement within an if-statement. + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # the 1st argument of all these operators is a Descriptor. + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items = [desc] + + if first_token == 'Offset': + expect_token(',', tokens[pos], 'Offset()') + pos += 1 + try: + t_offset = int(tokens[pos]) + pos += 1 + d.items.append(t_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + if tokens[pos] == ')': + return (d, pos + 1) + elif tokens[pos] != ',': + raise XconfigParserError("Parsing Offset(), expected ')' or ',', got " + tokens[pos]) + pos += 1 + try: + x_offset = int(tokens[pos]) + pos += 1 + d.items.append(x_offset) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Offset()') + pos += 1 + elif first_token in [ 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + while True: + if tokens[pos] == ')': + # check num-items is correct for some special cases. + if first_token == 'Failover' and len(d.items) != 2: + raise XconfigParserError("Parsing Failover(), expected 2 items but got {0}".format(len(d.items))) + if first_token == 'IfDefined' and len(d.items) != 1: + raise XconfigParserError("Parsing IfDefined(), expected 1 item but got {0}".format(len(d.items))) + pos += 1 + break + elif tokens[pos] == ',': + pos += 1 # consume the comma. 
+ else: + raise XconfigParserError("Parsing Append(), expected ')' or ',', got " + tokens[pos]) + + (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + elif first_token == 'Round': + expect_token(',', tokens[pos], 'Round()') + pos += 1 + try: + t_modulus = int(tokens[pos]) + assert t_modulus > 0 + pos += 1 + d.items.append(t_modulus) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'Round()') + pos += 1 + elif first_token == 'ReplaceIndex': + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + if tokens[pos] in [ 'x', 't' ]: + d.items.append(tokens[pos]) + pos += 1 + else: + raise XconfigParserError("Parsing ReplaceIndex(), expected 'x' or 't', got " + + tokens[pos]) + expect_token(',', tokens[pos], 'ReplaceIndex()') + pos += 1 + try: + new_value = int(tokens[pos]) + pos += 1 + d.items.append(new_value) + except: + raise XconfigParserError("Parsing Offset(), expected integer, got " + tokens[pos]) + expect_token(')', tokens[pos], 'ReplaceIndex()') + pos += 1 + else: + raise XconfigParserError("code error") + elif first_token in [ 'end of string', '(', ')', ',', '@' ]: + raise XconfigParserError("Expected descriptor, got " + first_token) + elif is_valid_line_name(first_token) or first_token == '[': + # This section parses a raw input/layer/output name, e.g. "affine2" + # (which must start with an alphabetic character or underscore), + # optionally followed by an offset like '@-3'. + + d.operator = None + d.items = [first_token] + + # If the layer-name o is followed by '@', then + # we're parsing something like 'affine1@-3' which + # is syntactic sugar for 'Offset(affine1, 3)'. + if tokens[pos] == '@': + pos += 1 + try: + offset_t = int(tokens[pos]) + pos += 1 + except: + raise XconfigParserError("Parse error parsing {0}@{1}".format( + first_token, tokens[pos])) + if offset_t != 0: + inner_d = d + d = Descriptor() + # e.g. foo@3 is equivalent to 'Offset(foo, 3)'. + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + else: + # the last possible case is that 'first_token' is just an integer i, + # which can appear in things like Append(-3, 0, 3). + # See if the token is an integer. + # In this case, it's interpreted as the name of previous layer + # (with that time offset applied). + try: + offset_t = int(first_token) + except: + raise XconfigParserError("Parsing descriptor, expected descriptor but got " + + first_token) + assert isinstance(prev_names, list) + if len(prev_names) < 1: + raise XconfigParserError("Parsing descriptor, could not interpret '{0}' because " + "there is no previous layer".format(first_token)) + d.operator = None + # the layer name is the name of the most recent layer. + d.items = [prev_names[-1]] + if offset_t != 0: + inner_d = d + d = Descriptor() + d.operator = 'Offset' + d.items = [ inner_d, offset_t ] + return (d, pos) + + +# This function takes a string 'descriptor_string' which might +# look like 'Append([-1], [-2], input)', and a list of previous layer +# names like prev_names = ['foo', 'bar', 'baz'], and replaces +# the integers in brackets with the previous layers. -1 means +# the most recent previous layer ('baz' in this case), -2 +# means the last layer but one ('bar' in this case), and so on. +# It will throw an exception if the number is out of range. +# If there are no such expressions in the string, it's OK if +# prev_names == None (this is useful for testing). 
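As a usage illustration of the bracket shorthand handled by the function defined just below (this assumes the module is importable as libs.nnet3.xconfig.utils, the path parser.py uses; the expected values follow the checks in test_library() at the end of this file):

from __future__ import print_function
from libs.nnet3.xconfig.utils import (replace_bracket_expressions_in_descriptor,
                                      tokenize_descriptor)

prev_names = ['foo', 'bar', 'baz']

# [-1] is the most recent previous layer, [-2] the one before it, and so on.
print(replace_bracket_expressions_in_descriptor('Append([-1], [-2], input)', prev_names))
# prints: Append(baz, bar, input)

# tokenize_descriptor() applies the same substitution before tokenizing;
# the trailing 'end of string' marker is dropped here for readability.
print(tokenize_descriptor('[-1]@2', ['foo', 'bar'])[:-1])
# prints: ['bar', '@', '2']
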
+def replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names = None): + fields = re.split(r'(\[|\])\s*', descriptor_string) + out_fields = [] + i = 0 + while i < len(fields): + f = fields[i] + i += 1 + if f == ']': + raise XconfigParserError("Unmatched ']' in descriptor") + elif f == '[': + if i + 2 >= len(fields): + raise XconfigParserError("Error tokenizing string '{0}': '[' found too close " + "to the end of the descriptor.".format(descriptor_string)) + assert isinstance(prev_names, list) + try: + offset = int(fields[i]) + assert offset < 0 and -offset <= len(prev_names) + i += 2 # consume the int and the ']'. + except: + raise XconfigParserError("Error tokenizing string '{0}': expression [{1}] has an " + "invalid or out of range offset.".format(descriptor_string, fields[i])) + this_field = prev_names[offset] + out_fields.append(this_field) + else: + out_fields.append(f) + return ''.join(out_fields) + +# tokenizes 'descriptor_string' into the tokens that may be part of Descriptors. +# Note: for convenience in parsing, we add the token 'end-of-string' to this +# list. +# The argument 'prev_names' (for the names of previous layers and input and +# output nodes) is needed to process expressions like [-1] meaning the most +# recent layer, or [-2] meaning the last layer but one. +# The default None for prev_names is only supplied for testing purposes. +def tokenize_descriptor(descriptor_string, + prev_names = None): + # split on '(', ')', ',', '@', and space. Note: the parenthesis () in the + # regexp causes it to output the stuff inside the () as if it were a field, + # which is how the call to re.split() keeps characters like '(' and ')' as + # tokens. + fields = re.split(r'(\(|\)|@|,|\s)\s*', + replace_bracket_expressions_in_descriptor(descriptor_string, + prev_names)) + ans = [] + for f in fields: + # don't include fields that are space, or are empty. + if re.match(r'^\s*$', f) is None: + ans.append(f) + + ans.append('end of string') + return ans + + +# This function parses a line in a config file, something like +# affine-layer name=affine1 input=Append(-3, 0, 3) +# and returns a pair, +# (first_token, fields), as (string, dict) e.g. in this case +# ('affine-layer', {'name':'affine1', 'input':'Append(-3, 0, 3)" +# Note: spaces are allowed in the field names but = signs are +# disallowed, which is why it's possible to parse them. +# This function also removes comments (anything after '#'). +# As a special case, this function will return None if the line +# is empty after removing spaces. +def parse_config_line(orig_config_line): + # Remove comments. + # note: splitting on '#' will always give at least one field... python + # treats splitting on space as a special case that may give zero fields. + config_line = orig_config_line.split('#')[0] + if re.match('[^a-zA-Z0-9\.\-\(\)_\s"]', config_line) is not None: + raise XconfigParserError("Xconfig line has unknown characters.", config_line) + + # Now split on space; later we may splice things back together. + fields=config_line.split() + if len(fields) == 0: + return None # Line was only whitespace after removing comments. + first_token = fields[0] + # if first_token does not look like 'foo-bar' or 'foo-bar2', then die. + if re.match('^[a-z][-a-z0-9]+$', first_token) is None: + raise XconfigParserError("Error parsing config line (first field doesn't look right): {0}".format( + orig_config_line)) + # get rid of the first field which we put in 'first_token'. 
+ fields = fields[1:] + + rest_of_line = ' '.join(fields) + # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' + positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + if not len(positions) % 2 == 0: + raise XconfigParserError('"s should occur in pairs', config_line) + + # add the " enclosed strings and corresponding keys to the dict + # and remove them from the rest_of_line + num_strings = len(positions) / 2 + fields = [] + for i in range(num_strings): + start = positions[i * 2] + end = positions[i * 2 + 1] + rest_of_line_after = rest_of_line[end + 1:] + parts = rest_of_line[:start].split() + rest_of_line_before = ' '.join(parts[:-1]) + assert(parts[-1][-1] == '=') + fields.append(parts[-1][:-1]) + fields.append(rest_of_line[start + 1 : end]) + rest_of_line = rest_of_line_before + ' ' + rest_of_line_after + + # suppose rest_of_line is: 'input=Append(foo, bar) foo=bar' + # then after the below we'll get + # fields = ['', 'input', 'Append(foo, bar)', 'foo', 'bar'] + ans_dict = dict() + other_fields = re.split(r'\s*([-a-zA-Z0-9_]*)=', rest_of_line) + if not (other_fields[0] == '' and len(other_fields) % 2 == 1): + raise XconfigParserError("Could not parse config line: " + orig_config_line) + fields += other_fields[1:] + num_variables = len(fields) / 2 + for i in range(num_variables): + var_name = fields[i * 2] + var_value = fields[i * 2 + 1] + if re.match(r'[a-zA-Z_]', var_name) is None: + raise XconfigParserError("Expected variable name '{0}' to start with alphabetic character or _, " + "in config line {1}".format(var_name, orig_config_line)) + if var_name in ans_dict: + raise XconfigParserError("Config line has multiply defined variable {0}: {1}".format( + var_name, orig_config_line)) + ans_dict[var_name] = var_value + return (first_token, ans_dict) + +# Reads a config file and returns a list of objects, where each object +# represents one line of the file. 
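Before the file-level reader below, here is what parse_config_line() (defined just above) returns for a typical xconfig line; this mirrors the calls in test_library() at the end of the file and assumes the libs.nnet3.xconfig.utils import path used by parser.py.

from libs.nnet3.xconfig.utils import parse_config_line

(first_token, key_to_value) = parse_config_line('affine-layer input=Append(foo, bar) foo=bar')
assert first_token == 'affine-layer'
assert key_to_value == {'input': 'Append(foo, bar)', 'foo': 'bar'}

# Lines that are blank or contain only comments come back as None.
assert parse_config_line('# just a comment') is None

# Quoted values such as opt2="a=1 b=2" are kept as a single value
# (see the example lines in test_library() below).
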
+def read_config_file(filename): + try: + f = open(filename, "r") + except XconfigParserError as e: + raise XconfigParserError("Error reading config file {0}: {1}".format( + filename, repr(e))) + ans = [] + prev_names = [] + while True: + line = f.readline() + if line == '': + break + x = parse_config_line(line) + if x is None: + continue # blank line + (first_token, key_to_value) = x + layer_object = config_line_to_object(first_token, key_to_value, prev_names) + ans.append(layer_object) + prev_names.append(layer_object.get_name()) + +def test_library(): + tokenize_test = lambda x: tokenize_descriptor(x)[:-1] # remove 'end of string' + assert tokenize_test("hi") == ['hi'] + assert tokenize_test("hi there") == ['hi', 'there'] + assert tokenize_test("hi,there") == ['hi', ',', 'there'] + assert tokenize_test("hi@-1,there") == ['hi', '@', '-1', ',', 'there'] + assert tokenize_test("hi(there)") == ['hi', '(', 'there', ')'] + assert tokenize_descriptor("[-1]@2", ['foo', 'bar'])[:-1] == ['bar', '@', '2' ] + assert tokenize_descriptor("[-2].special@2", ['foo', 'bar'])[:-1] == ['foo.special', '@', '2' ] + + assert Descriptor('foo').str() == 'foo' + assert Descriptor('Sum(foo,bar)').str() == 'Sum(foo, bar)' + assert Descriptor('Sum(Offset(foo,1),Offset(foo,0))').str() == 'Sum(Offset(foo, 1), Offset(foo, 0))' + for x in [ 'Append(foo, Sum(bar, Offset(baz, 1)))', 'Failover(foo, Offset(bar, -1))', + 'IfDefined(Round(baz, 3))', 'Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))', + 'IfDefined(ReplaceIndex(ivector, t, 0))', 'ReplaceIndex(foo, x, 0)' ]: + if not Descriptor(x).str() == x: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), x)) + + prev_names = ['last_but_one_layer', 'prev_layer'] + for x, y in [ ('Sum(foo,bar)', 'Sum(foo, bar)'), + ('Sum(foo1,bar-3_4)', 'Sum(foo1, bar-3_4)'), + ('Append(input@-3, input@0, input@3)', + 'Append(Offset(input, -3), input, Offset(input, 3))'), + ('Append(-3,0,3)', + 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), + ('[-1]', 'prev_layer'), + ('[-2]', 'last_but_one_layer'), + ('[-2]@3', + 'Offset(last_but_one_layer, 3)') ]: + if not Descriptor(x, prev_names).str() == y: + print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), y)) + + + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar opt2="a=1 b=2"')) + print(parse_config_line('affine-layer1 input=Append(foo, bar) foo=bar')) + print(parse_config_line('affine-layer')) + +if __name__ == "__main__": + test_library() diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py index f012d06cca9..d58db33bf98 100644 --- a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -169,7 +169,8 @@ def PrepareInitialAcousticModel(dir, run_opts): command = run_opts.command, dir = dir)) def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, - egs_dir, leaky_hmm_coefficient, l2_regularize, + egs_dir, left_context, right_context, + leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts): # Now do combination. 
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where @@ -188,10 +189,13 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, nnet3-chain-combine --num-iters=40 \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {dir}/den.fst {raw_models} "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ + --verbose=3 {dir}/den.fst {raw_models} \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/combine.cegs ark:- | \ + nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, dir = dir, raw_models = " ".join(raw_model_strings), num_chunk_per_minibatch = num_chunk_per_minibatch, @@ -201,9 +205,20 @@ def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'final', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts, + wait = False) -def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, +def ComputeTrainCvProbabilities(dir, iter, + egs_dir, left_context, right_context, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) @@ -213,9 +228,10 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/valid_diagnostic.cegs ark:-| nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) @@ -225,11 +241,12 @@ def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regulari nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ --xent-regularize={xent_reg} \ "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ - "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} ark:{egs_dir}/train_diagnostic.cegs ark:- | nnet3-chain-merge-egs ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, + lc = left_context, rc = right_context, l2 = l2_regularize, leaky = leaky_hmm_coefficient, xent_reg = xent_regularize, egs_dir = egs_dir), wait = wait) diff 
--git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index cd9ebf4c7a3..53bd9f8924b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -118,11 +118,7 @@ def GetArgs(): " chain model's output") parser.add_argument("--chain.left-deriv-truncate", type=int, dest='left_deriv_truncate', - default = None, help="") - parser.add_argument("--chain.right-deriv-truncate", type=int, - dest='right_deriv_truncate', - default = None, help="") - + default = None, help="Deprecated. Kept for back compatibility") # trainer options parser.add_argument("--trainer.srand", type=int, dest='srand', @@ -224,6 +220,12 @@ def GetArgs(): parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', default=512, help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.deriv-truncate-margin", type=int, dest='deriv_truncate_margin', + default = None, + help="If specified, it is the number of frames that the derivative will be backpropagated through the chunk boundaries, " + "e.g., During BLSTM model training if the chunk-width=150 and deriv-truncate-margin=5, then the derivative will be " + "backpropagated up to t=-5 and t=154 in the forward and backward LSTM sequence respectively; " + "otherwise, the derivative will be backpropagated to the end of the sequence.") # General options parser.add_argument("--stage", type=int, default=-4, @@ -284,6 +286,12 @@ def ProcessArgs(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") + if not args.left_deriv_truncate is None: + args.deriv_truncate_margin = -args.left_deriv_truncate + logger.warning("--chain.left-deriv-truncate (deprecated) is set by user, " + "and --trainer.deriv-truncate-margin is set to negative of that value={0}. 
" + "We recommend using the option --trainer.deriv-truncate-margin.".format(args.deriv_truncate_margin)) + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): raise Exception("""This scripts expects {0} to exist and have a configs directory which is the output of make_configs.py script""") @@ -325,9 +333,9 @@ def __init__(self): def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, + raw_model_string, egs_dir, left_context, right_context, apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, + min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch, @@ -340,10 +348,10 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi # but we use the same script for consistency with FF-DNN code deriv_time_opts="" - if left_deriv_truncate is not None: - deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) - if right_deriv_truncate is not None: - deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + if not min_deriv_time is None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(min_deriv_time) + if not max_deriv_time is None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(max_deriv_time) processes = [] for job in range(1,num_jobs+1): @@ -366,7 +374,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ "{raw_model}" {dir}/den.fst \ - "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + "ark,bg:nnet3-chain-copy-egs --left-context={lc} --right-context={rc} --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw """.format(command = run_opts.command, train_queue_opt = run_opts.train_queue_opt, @@ -379,11 +387,12 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi parallel_train_opts = run_opts.parallel_train_opts, momentum = momentum, max_param_change = max_param_change, raw_model = raw_model_string, - egs_dir = egs_dir, archive_index = archive_index, + egs_dir = egs_dir, lc=left_context, rc=right_context, + archive_index = archive_index, shuffle_buffer_size = shuffle_buffer_size, cache_io_opts = cur_cache_io_opts, num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) + wait = False) processes.append(process_handle) @@ -404,7 +413,8 @@ def TrainOneIteration(dir, iter, srand, egs_dir, num_jobs, num_archives_processed, num_archives, learning_rate, shrinkage_value, num_chunk_per_minibatch, num_hidden_layers, add_layers_period, - apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + left_context, right_context, + apply_deriv_weights, min_deriv_time, max_deriv_time, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, 
truncate_deriv_weights, @@ -427,8 +437,15 @@ def TrainOneIteration(dir, iter, srand, egs_dir, f.write(str(srand)) f.close() - chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, - l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + chain_lib.ComputeTrainCvProbabilities(dir = dir, + iter = iter, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + run_opts = run_opts) if iter > 0: chain_lib.ComputeProgress(dir, iter, run_opts) @@ -460,15 +477,30 @@ def TrainOneIteration(dir, iter, srand, egs_dir, cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 cur_max_param_change = float(max_param_change) / math.sqrt(2) - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - left_deriv_truncate, right_deriv_truncate, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, cur_max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - frame_subsampling_factor, truncate_deriv_weights, - cache_io_opts, run_opts) + TrainNewModels(dir = dir, + iter = iter, + srand = srand, + num_jobs = num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + raw_model_string = raw_model_string, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = l2_regularize, + xent_regularize = xent_regularize, + leaky_hmm_coefficient = leaky_hmm_coefficient, + momentum = momentum, + max_param_change = cur_max_param_change, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = cur_num_chunk_per_minibatch, + frame_subsampling_factor = frame_subsampling_factor, + truncate_deriv_weights = truncate_deriv_weights, + cache_io_opts = cache_io_opts, + run_opts = run_opts) [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) nnets_list = [] @@ -567,14 +599,15 @@ def Train(args, run_opts): left_context = args.chunk_left_context + model_left_context right_context = args.chunk_right_context + model_right_context + egs_left_context = left_context + args.frame_subsampling_factor/2 + egs_right_context = right_context + args.frame_subsampling_factor/2 default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") # this is where get_egs.sh is called. 
chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, - left_context + args.frame_subsampling_factor/2, - right_context + args.frame_subsampling_factor/2, + egs_left_context, egs_right_context, run_opts, left_tolerance = args.left_tolerance, right_tolerance = args.right_tolerance, @@ -594,7 +627,7 @@ def Train(args, run_opts): else: egs_dir = args.egs_dir - [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, egs_left_context, egs_right_context) assert(args.chunk_width == frames_per_eg) num_archives_expanded = num_archives * args.frame_subsampling_factor @@ -638,6 +671,12 @@ def Train(args, run_opts): args.initial_effective_lrate, args.final_effective_lrate) + min_deriv_time = None + max_deriv_time = None + if not args.deriv_truncate_margin is None: + min_deriv_time = -args.deriv_truncate_margin + max_deriv_time = args.chunk_width - 1 + args.deriv_truncate_margin + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): if (args.exit_stage is not None) and (iter == args.exit_stage): @@ -653,18 +692,32 @@ def Train(args, run_opts): shrinkage_value = args.shrink_value logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value, - args.num_chunk_per_minibatch, - num_hidden_layers, args.add_layers_period, - args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, - args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, - args.frame_subsampling_factor, - args.truncate_deriv_weights, run_opts) + TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + apply_deriv_weights = args.apply_deriv_weights, + min_deriv_time = min_deriv_time, + max_deriv_time = max_deriv_time, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + frame_subsampling_factor = args.frame_subsampling_factor, + truncate_deriv_weights = args.truncate_deriv_weights, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -683,10 +736,17 @@ def Train(args, run_opts): if args.stage <= num_iters: logger.info("Doing final combination to produce final.mdl") - chain_lib.CombineModels(args.dir, num_iters, 
num_iters_combine, - args.num_chunk_per_minibatch, egs_dir, - args.leaky_hmm_coefficient, args.l2_regularize, - args.xent_regularize, run_opts) + chain_lib.CombineModels(dir = args.dir, + num_iters = num_iters, + num_iters_combine = num_iters_combine, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + leaky_hmm_coefficient = args.leaky_hmm_coefficient, + l2_regularize = args.l2_regularize, + xent_regularize = args.xent_regularize, + run_opts = run_opts) if args.cleanup: logger.info("Cleaning up the experiment directory {0}".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 95f6c784851..4bfcb219fc3 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -96,7 +96,7 @@ def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = "" # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) @@ -111,7 +111,7 @@ def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' # Per-component max-change option max_change_options = "max-change={0:.2f}".format(max_change_per_component) if max_change_per_component is not None else '' - + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input['dimension'], output_dim, ng_affine_options, max_change_options)) components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) @@ -484,4 +484,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 2290c4d2e7f..2a6499090e2 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -90,11 +90,12 @@ def GetDotNodeName(name_string, is_component = False): # this function is required as dot does not allow all the component names # allowed by nnet3. # Identified incompatibilities : - # 1. dot does not allow hyphen(-) in names + # 1. dot does not allow hyphen(-) and dot(.) in names # 2. 
Nnet3 names can be shared among components and component nodes # dot does not allow common names # node_name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("\.", "_dot_", node_name_string) if is_component: node_name_string += node_name_string.strip() + "_component" return {"label":name_string, "node":node_name_string} diff --git a/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py new file mode 100644 index 00000000000..e6dc907fe0a --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_libs/train/__init__.py @@ -0,0 +1 @@ +# This module will house the latest training libraries being written by Vimal diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index c36de8c16bf..06ccf9657be 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -17,6 +17,7 @@ if [ $# != 3 ]; then echo " e.g.: $0 exp/sdm1/nnet3/lstm_sp/0.mdl lstm.dot lstm.png" echo "" echo "Main options (for others, see top of script file)" + echo " --info-bin # Name of the binary to generate the nnet3 file" echo " --component-attributes # attributes to be printed in nnet3 components" echo " --node-prefixes # list of prefixes. Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" @@ -34,6 +35,7 @@ $info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ $attr $dot_file +echo "Generated the dot file $dot_file" command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..e92ab05a847 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -252,7 +252,10 @@ def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') if (egs_left_context < left_context) or (egs_right_context < right_context): - raise Exception('The egs have insufficient context') + raise Exception('The egs have insufficient context.' + ' Required left context is {rlc} and available left context is {alc}.' 
+ ' Required right context is {rrc} and available right context is {arc}.'.format(rlc = left_context, alc = egs_left_context, + rrc = right_context, arc = egs_right_context)) frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) @@ -506,52 +509,65 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): return False -def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False): +def ComputeTrainCvProbabilities(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait = False): model = '{0}/{1}.mdl'.format(dir, iter) + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/valid_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) RunKaldiCommand(""" {command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ - "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |" + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |" """.format(command = run_opts.command, dir = dir, iter = iter, mb_size = mb_size, model = model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) -def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False): +def ComputeProgress(dir, iter, egs_dir, left_context, right_context, + run_opts, mb_size=256, wait=False): prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) model = '{0}/{1}.mdl'.format(dir, iter) + + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {dir}/log/progress.{iter}.log \ nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ -"ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|" +"ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/train_diagnostic.egs ark:- | nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:-|" """.format(command = run_opts.command, dir = dir, iter = iter, model = model, mb_size = mb_size, prev_model = prev_model, + context_opts = context_opts, egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, left_context, right_context, chunk_width = None): # Now do combination. 
In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -570,26 +586,39 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ + --verbose=3 {raw_models} "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/combine.egs ark:- | \ + nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:- ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, num_iters = num_iters, + context_opts = context_opts, egs_dir = egs_dir)) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + ComputeTrainCvProbabilities(dir = dir, + iter = 'combined', + egs_dir = egs_dir, + left_context = left_context, + right_context = right_context, + run_opts = run_opts, + wait = False) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, left_context, right_context, + run_opts): # Note: this just uses CPUs, using a smallish subset of data. """ Computes the average posterior of the network""" import glob @@ -601,19 +630,24 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ - nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:- ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ -matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, dir = dir, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, egs_dir = egs_dir, egs_part = egs_part, + context_opts = context_opts, prior_gpu_opt = run_opts.prior_gpu_opt)) # make sure there is time for $dir/post.{iter}.*.vec to appear. 
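The same context-trimming pattern recurs in each of the pipelines above; the following is a small illustrative sketch (a hypothetical helper, not part of this patch) of how the context_opts string is built and spliced into an egs-reading pipeline before nnet3-merge-egs.

from __future__ import print_function

def diagnostic_egs_rspecifier(egs_dir, left_context, right_context, mb_size=256):
    # Mirrors the pattern used above: trim the egs to the context the current
    # model needs with nnet3-copy-egs before merging them into minibatches.
    context_opts = "--left-context={0} --right-context={1}".format(
        left_context, right_context)
    return ("ark,bg:nnet3-copy-egs {context_opts} "
            "ark:{egs_dir}/valid_diagnostic.egs ark:- | "
            "nnet3-merge-egs --minibatch-size={mb_size} ark:- ark:- |".format(
                context_opts=context_opts, egs_dir=egs_dir, mb_size=mb_size))

# Illustrative values; the real scripts compute these from the model's context.
print(diagnostic_egs_rspecifier('exp/nnet3/tdnn/egs', left_context=16, right_context=12))
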
diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..26ca16c364b 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -102,13 +102,23 @@ def Compile(self): lat_file.close() logger.info("Compiling the latex report.") try: - proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen(['pdflatex', '-interaction=batchmode', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc.communicate() except Exception as e: logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) return False return True +def LatexCompliantName(name_string): + # this function is required as latex does not allow all the component names + # allowed by nnet3. + # Identified incompatibilities : + # 1. latex does not allow dot(.) in file names + # + node_name_string = re.sub("\.", "_dot_", name_string) + + return node_name_string + def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): assert(start_iter >= 1) @@ -240,7 +250,8 @@ def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, s lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) @@ -317,7 +328,8 @@ def GenerateClippedProportionPlots(exp_dir, output_dir, plot, comparison_dir = N lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) plt.grid(True) fig.suptitle("Clipped-proportion value at {comp_name}".format(comp_name = component_name)) - figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + comp_name = LatexCompliantName(component_name) + figfile_name = '{dir}/clipped_proportion_{comp_name}.pdf'.format(dir = output_dir, comp_name = comp_name) fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') if latex_report is not None: latex_report.AddFigure(figfile_name, "Clipped proportion at {0}".format(component_name)) @@ -417,7 +429,8 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) 
diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py
index e4a9e617e48..4139d446872 100755
--- a/egs/wsj/s5/steps/nnet3/train_dnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_dnn.py
@@ -359,10 +359,14 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
         f.write(str(srand))
         f.close()
 
-    ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts)
+    ComputeTrainCvProbabilities(dir=dir, iter=iter, egs_dir=egs_dir,
+                                left_context=left_context, right_context=right_context,
+                                run_opts=run_opts)
 
     if iter > 0:
-        ComputeProgress(dir, iter, egs_dir, run_opts)
+        ComputeProgress(dir=dir, iter=iter, egs_dir=egs_dir,
+                        left_context=left_context, right_context=right_context,
+                        run_opts=run_opts)
 
     if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0):
 
@@ -578,14 +582,24 @@ def Train(args, run_opts):
             logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed)))
-            TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs,
-                              num_archives_processed, num_archives,
-                              learning_rate(iter, current_num_jobs, num_archives_processed),
-                              args.minibatch_size, args.frames_per_eg,
-                              num_hidden_layers, args.add_layers_period,
-                              left_context, right_context,
-                              args.momentum, args.max_param_change,
-                              args.shuffle_buffer_size, run_opts)
+            TrainOneIteration(dir = args.dir,
+                              iter = iter,
+                              srand = args.srand,
+                              egs_dir = egs_dir,
+                              num_jobs = current_num_jobs,
+                              num_archives_processed = num_archives_processed,
+                              num_archives = num_archives,
+                              learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed),
+                              minibatch_size = args.minibatch_size,
+                              frames_per_eg = args.frames_per_eg,
+                              num_hidden_layers = num_hidden_layers,
+                              add_layers_period = args.add_layers_period,
+                              left_context = left_context,
+                              right_context = right_context,
+                              momentum = args.momentum,
+                              max_param_change = args.max_param_change,
+                              shuffle_buffer_size = args.shuffle_buffer_size,
+                              run_opts = run_opts)
 
             if args.cleanup:
                 # do a clean up everythin but the last 2 models, under certain conditions
                 RemoveModel(args.dir, iter-2, num_iters, num_iters_combine,
@@ -604,12 +618,24 @@ def Train(args, run_opts):
     if args.stage <= num_iters:
         logger.info("Doing final combination to produce final.mdl")
-        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts)
+        CombineModels(dir = args.dir,
+                      num_iters = num_iters,
+                      num_iters_combine = num_iters_combine,
+                      egs_dir = egs_dir,
+                      left_context = left_context,
+                      right_context = right_context,
+                      run_opts = run_opts)
 
     if args.stage <= num_iters + 1:
         logger.info("Getting average posterior for purposes of adjusting the priors.")
-        avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir,
-                                                    num_archives, args.prior_subset_size, run_opts)
+        avg_post_vec_file = ComputeAveragePosterior(dir = args.dir,
+                                                    iter = 'combined',
+                                                    egs_dir = egs_dir,
+                                                    num_archives = num_archives,
+                                                    prior_subset_size = args.prior_subset_size,
+                                                    left_context = left_context,
+                                                    right_context = right_context,
+                                                    run_opts = run_opts)
 
         logger.info("Re-adjusting priors based on computed posteriors")
         combined_model = "{dir}/combined.mdl".format(dir = args.dir)
diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py
index 7ac7a58a3d5..89db4276cfc 100755
--- a/egs/wsj/s5/steps/nnet3/train_rnn.py
+++ b/egs/wsj/s5/steps/nnet3/train_rnn.py
@@ -194,7 +194,7 @@ def GetArgs():
                         help="Number of sequences to be processed in parallel every minibatch" )
     parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps',
                         default=None,
-                        help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." )
+                        help="The number of time steps to back-propagate from the last label in the chunk. By default it is set to (chunk-width + 10)." )
 
     # General options
     parser.add_argument("--stage", type=int, default=-4,
@@ -346,7 +346,7 @@ def __init__(self):
 
 def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
                    raw_model_string, egs_dir,
-                   left_context, right_context, min_deriv_time,
+                   left_context, right_context, min_deriv_time, max_deriv_time,
                    momentum, max_param_change,
                    shuffle_buffer_size, num_chunk_per_minibatch,
                    cache_read_opt, run_opts):
@@ -375,7 +375,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
 nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \
  --print-interval=10 --momentum={momentum} \
  --max-param-change={max_param_change} \
- --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \
+ --optimization.min-deriv-time={min_deriv_time} --optimization.max-deriv-time={max_deriv_time} "{raw_model}" \
  "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \
  {dir}/{next_iter}.{job}.raw
     """.format(command = run_opts.command,
@@ -384,7 +384,7 @@ def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archi
                parallel_train_opts = run_opts.parallel_train_opts,
                cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt,
                momentum = momentum, max_param_change = max_param_change,
-               min_deriv_time = min_deriv_time,
+               min_deriv_time = min_deriv_time, max_deriv_time = max_deriv_time,
               raw_model = raw_model_string, context_opts = context_opts,
               egs_dir = egs_dir, archive_index = archive_index,
               shuffle_buffer_size = shuffle_buffer_size,
@@ -409,7 +409,7 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
                       num_jobs, num_archives_processed, num_archives,
                       learning_rate, shrinkage_value, num_chunk_per_minibatch,
                       num_hidden_layers, add_layers_period,
-                      left_context, right_context, min_deriv_time,
+                      left_context, right_context, min_deriv_time, max_deriv_time,
                       momentum, max_param_change, shuffle_buffer_size,
                       cv_minibatch_size, run_opts):
     # Set off jobs doing some diagnostics, in the background.
@@ -430,10 +430,22 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
         f.close()
 
-    ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size)
+    ComputeTrainCvProbabilities(dir = dir,
+                                iter = iter,
+                                egs_dir = egs_dir,
+                                left_context = left_context,
+                                right_context = right_context,
+                                run_opts = run_opts,
+                                mb_size=cv_minibatch_size)
 
     if iter > 0:
-        ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size)
+        ComputeProgress(dir = dir,
+                        iter = iter,
+                        egs_dir = egs_dir,
+                        left_context = left_context,
+                        right_context = right_context,
+                        run_opts = run_opts,
+                        mb_size=cv_minibatch_size)
 
     # an option for writing cache (storing pairs of nnet-computations
     # and computation-requests) during training.
@@ -467,12 +479,24 @@ def TrainOneIteration(dir, iter, srand, egs_dir,
         except OSError:
             pass
 
-    TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives,
-                   raw_model_string, egs_dir,
-                   left_context, right_context, min_deriv_time,
-                   momentum, max_param_change,
-                   shuffle_buffer_size, cur_num_chunk_per_minibatch,
-                   cache_read_opt, run_opts)
+    TrainNewModels(dir = dir,
+                   iter = iter,
+                   srand = srand,
+                   num_jobs = num_jobs,
+                   num_archives_processed = num_archives_processed,
+                   num_archives = num_archives,
+                   raw_model_string = raw_model_string,
+                   egs_dir = egs_dir,
+                   left_context = left_context,
+                   right_context = right_context,
+                   min_deriv_time = min_deriv_time,
+                   max_deriv_time = max_deriv_time,
+                   momentum = momentum,
+                   max_param_change = max_param_change,
+                   shuffle_buffer_size = shuffle_buffer_size,
+                   num_chunk_per_minibatch = cur_num_chunk_per_minibatch,
+                   cache_read_opt = cache_read_opt,
+                   run_opts = run_opts)
     [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter))
     nnets_list = []
     for n in models_to_average:
@@ -627,11 +651,13 @@ def Train(args, run_opts):
             cur_egs_dir=egs_dir
 
     if args.num_bptt_steps is None:
-        num_bptt_steps = args.chunk_width
+        # num_bptt_steps is set to (chunk_width + 10) by default
+        num_bptt_steps = args.chunk_width + min(10, args.chunk_left_context, args.chunk_right_context)
     else:
         num_bptt_steps = args.num_bptt_steps
 
     min_deriv_time = args.chunk_width - num_bptt_steps
+    max_deriv_time = num_bptt_steps - 1
 
     logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
 
@@ -672,6 +698,7 @@ def Train(args, run_opts):
                                        left_context = left_context,
                                        right_context = right_context,
                                        min_deriv_time = min_deriv_time,
+                                       max_deriv_time = max_deriv_time,
                                        momentum = args.momentum,
                                        max_param_change= args.max_param_change,
                                        shuffle_buffer_size = args.shuffle_buffer_size,
@@ -696,13 +723,25 @@ def Train(args, run_opts):
     if args.stage <= num_iters:
         logger.info("Doing final combination to produce final.mdl")
-        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts,
-                      chunk_width = args.chunk_width)
+        CombineModels(dir = args.dir,
+                      num_iters = num_iters,
+                      num_iters_combine = num_iters_combine,
+                      egs_dir = egs_dir,
+                      left_context = left_context,
+                      right_context = right_context,
+                      run_opts = run_opts,
+                      chunk_width = args.chunk_width)
 
     if args.stage <= num_iters + 1:
         logger.info("Getting average posterior for purposes of adjusting the priors.")
-        avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir,
-                                                    num_archives, args.prior_subset_size, run_opts)
+        avg_post_vec_file = ComputeAveragePosterior(dir = args.dir,
+                                                    iter = 'combined',
+                                                    egs_dir = egs_dir,
+                                                    num_archives = num_archives,
+                                                    prior_subset_size = args.prior_subset_size,
+                                                    left_context = left_context,
+                                                    right_context = right_context,
+                                                    run_opts = run_opts)
 
         logger.info("Re-adjusting priors based on computed posteriors")
         combined_model = "{dir}/combined.mdl".format(dir = args.dir)
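To make the new deriv-time defaults concrete: num_bptt_steps now reaches up to 10 frames beyond the chunk (bounded by the available chunk context), which lets min-deriv-time go negative, while max-deriv-time caps the other end. The numbers below are only an example that mirrors the expressions in train_rnn.py above:

# Example values only; the real script takes these from its options.
chunk_width = 20
chunk_left_context = 40
chunk_right_context = 40

num_bptt_steps = chunk_width + min(10, chunk_left_context, chunk_right_context)
min_deriv_time = chunk_width - num_bptt_steps   # 20 - 30 = -10
max_deriv_time = num_bptt_steps - 1             # 29

print(num_bptt_steps, min_deriv_time, max_deriv_time)  # 30 -10 29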
diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
new file mode 100755
index 00000000000..e29a9404403
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+
+# we're using python 3.x style print but want it to work in python 2.x,
+from __future__ import print_function
+import os
+import argparse
+import shlex
+import sys
+import warnings
+import copy
+import imp
+import ast
+from collections import defaultdict
+
+sys.path.insert(0, 'steps/')
+# the following is in case we weren't running this from the normal directory.
+sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/')
+
+import libs.nnet3.xconfig.parser as xparser
+# do the proper import when python scripts have been refactored
+nnet3_lib = imp.load_source('', 'steps/nnet3/nnet3_train_lib.py')
+
+def get_args():
+    # we add compulsory arguments as named arguments for readability
+    parser = argparse.ArgumentParser(description='Reads an xconfig file and creates config files '
+                                     'for neural net creation and training',
+                                     epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples')
+    parser.add_argument('--xconfig-file', required=True,
+                        help='Filename of input xconfig file')
+    parser.add_argument('--config-dir', required=True,
+                        help='Directory to write config files and variables')
+
+    print(' '.join(sys.argv))
+
+    args = parser.parse_args()
+    args = check_args(args)
+
+    return args
+
+def check_args(args):
+    if not os.path.exists(args.config_dir):
+        os.makedirs(args.config_dir)
+    return args
+
+
+
+
+def backup_xconfig_file(xconfig_file, config_dir):
+    # we write a copy of the xconfig file just to have a record of the original
+    # input.
+    try:
+        xconfig_file_out = open(config_dir + '/xconfig', 'w')
+    except:
+        sys.exit('{0}: error opening file {1}/xconfig for output'.format(
+            sys.argv[0], config_dir))
+    try:
+        xconfig_file_in = open(xconfig_file)
+    except:
+        sys.exit('{0}: error opening file {1} for input'.format(sys.argv[0], config_dir))
+
+    print("# This file was created by the command:\n"
+          "# {0}\n"
+          "# It is a copy of the source from which the config files in "
+          "# this directory were generated.\n".format(' '.join(sys.argv)),
+          file=xconfig_file_out)
+
+    while True:
+        line = xconfig_file_in.readline()
+        if line == '':
+            break
+        print(line.strip(), file=xconfig_file_out)
+    xconfig_file_out.close()
+    xconfig_file_in.close()
+
+
+# This function writes config_dir/xconfig.expanded.1 and
+# config_dir/xconfig.expanded.2, showing some of the internal stages of
+# processing the xconfig file before turning it into config files.
+def write_expanded_xconfig_files(config_dir, all_layers):
+    try:
+        xconfig_file_out = open(config_dir + '/xconfig.expanded.1', 'w')
+    except:
+        sys.exit('{0}: error opening file {1}/xconfig.expanded.1 for output'.format(
+            sys.argv[0], config_dir))
+
+
+    print('# This file was created by the command:\n'
+          '# ' + ' '.join(sys.argv) + '\n'
+          '# It contains the same content as ./xconfig but it was parsed and\n'
+          '# default config values were set.\n'
+          '# See also ./xconfig.expanded.2\n', file=xconfig_file_out)
+
+    for layer in all_layers:
+        print(str(layer), file=xconfig_file_out)
+    xconfig_file_out.close()
+
+    try:
+        xconfig_file_out = open(config_dir + '/xconfig.expanded.2', 'w')
+    except:
+        sys.exit('{0}: error opening file {1}/xconfig.expanded.2 for output'.format(
+            sys.argv[0], config_dir))
+
+    print('# This file was created by the command:\n'
+          '# ' + ' '.join(sys.argv) + '\n'
+          '# It contains the same content as ./xconfig but it was parsed,\n'
+          '# default config values were set, and Descriptors (input=xxx) were normalized.\n'
+          '# See also ./xconfig.expanded.1\n\n',
+          file=xconfig_file_out)
+
+    for layer in all_layers:
+        layer.normalize_descriptors()
+        print(str(layer), file=xconfig_file_out)
+    xconfig_file_out.close()
+
+# This function returns a map from config-file basename
+# e.g. 'init', 'ref', 'layer1' to a documentation string that goes
+# at the top of the file.
+def get_config_headers():
+    ans = defaultdict(str)  # resulting dict will default to the empty string
+    # for any config files not explicitly listed here.
+    ans['init'] = ('# This file was created by the command:\n'
+                   '# ' + ' '.join(sys.argv) + '\n'
+                   '# It contains the input of the network and is used in\n'
+                   '# accumulating stats for an LDA-like transform of the\n'
+                   '# input features.\n');
+    ans['ref'] = ('# This file was created by the command:\n'
+                  '# ' + ' '.join(sys.argv) + '\n'
+                  '# It contains the entire neural network, but with those\n'
+                  '# components that would normally require fixed vectors/matrices\n'
+                  '# read from disk, replaced with random initialization\n'
+                  '# (this applies to the LDA-like transform and the\n'
+                  '# presoftmax-prior-scale, if applicable). This file\n'
+                  '# is used only to work out the left-context and right-context\n'
+                  '# of the network.\n');
+    ans['final'] = ('# This file was created by the command:\n'
+                    '# ' + ' '.join(sys.argv) + '\n'
+                    '# It contains the entire neural network.\n')
+
+    return ans;
+
+
+
+
+# This is where most of the work of this program happens.
+def write_config_files(config_dir, all_layers):
+    # config_basename_to_lines is a map from the basename of the
+    # config, as a string (i.e. 'ref', 'all', 'init') to a list of
+    # strings representing lines to put in the config file.
+    config_basename_to_lines = defaultdict(list)
+
+    config_basename_to_header = get_config_headers()
+
+    for layer in all_layers:
+        try:
+            pairs = layer.get_full_config()
+            for config_basename, line in pairs:
+                config_basename_to_lines[config_basename].append(line)
+        except Exception as e:
+            print("{0}: error producing config lines from xconfig "
+                  "line '{1}': error was: {2}".format(sys.argv[0], str(layer),
+                  repr(e)), file=sys.stderr)
+            # we use raise rather than raise(e) as using a blank raise
+            # preserves the backtrace
+            raise
+
+    for basename,lines in config_basename_to_lines.items():
+        header = config_basename_to_header[basename]
+        filename = '{0}/{1}.config'.format(config_dir, basename)
+        try:
+            f = open(filename, 'w')
+            print(header, file=f)
+            for line in lines:
+                print(line, file=f)
+            f.close()
+        except Exception as e:
+            print('{0}: error writing to config file {1}: error is {2}'.format(
+                sys.argv[0], filename, repr(e)), file=sys.stderr)
+            # we use raise rather than raise(e) as using a blank raise
+            # preserves the backtrace
+            raise
+
+def add_back_compatibility_info(config_dir):
+    """This will be removed when python script refactoring is done."""
+
+    nnet3_lib.RunKaldiCommand("nnet3-init {0}/ref.config {0}/ref.raw".format(config_dir))
+    out, err = nnet3_lib.RunKaldiCommand("nnet3-info {0}/ref.raw | head -4".format(config_dir))
+    #out looks like this
+    # left-context: 7
+    # right-context: 0
+    # num-parameters: 90543902
+    # modulus: 1
+    info = {}
+    for line in out.split("\n"):
+        parts = line.split(":")
+        if len(parts) != 2:
+            continue
+        info[parts[0].strip()] = int(parts[1].strip())
+
+    # Writing the back-compatible vars file
+    # model_left_context=0
+    # model_right_context=7
+    # num_hidden_layers=3
+    vf = open('{0}/vars'.format(config_dir), 'w')
+    vf.write('model_left_context={0}\n'.format(info['left-context']))
+    vf.write('model_right_context={0}\n'.format(info['right-context']))
+    vf.write('num_hidden_layers=1\n')
+    vf.close()
+
+    nnet3_lib.ForceSymlink("final.config".format(config_dir),
+                           "{0}/layer1.config".format(config_dir))
+
+def main():
+    args = get_args()
+    backup_xconfig_file(args.xconfig_file, args.config_dir)
+    all_layers = xparser.read_xconfig_file(args.xconfig_file)
+    write_expanded_xconfig_files(args.config_dir, all_layers)
+    write_config_files(args.config_dir, all_layers)
+    add_back_compatibility_info(args.config_dir)
+
+
+if __name__ == '__main__':
+    main()
+
+
+# test:
+# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
+# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
+
+# mkdir -p foo; (echo 'input dim=40 name=input'; echo 'relu-renorm-layer name=affine1 dim=1024'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
+
+# mkdir -p foo; (echo 'input dim=100 name=ivector'; echo 'input dim=40 name=input'; echo 'fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=foo/bar/lda.mat'; echo 'output-layer name=output dim=1924 input=Append(-1,0,1)') >xconfig; ./xconfig_to_configs.py xconfig foo
diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh
index ea5264a0f07..054210cdd23 100755
--- a/egs/wsj/s5/utils/prepare_lang.sh
+++ b/egs/wsj/s5/utils/prepare_lang.sh
@@ -51,7 +51,6 @@
 # Begin configuration section.
 num_sil_states=5
 num_nonsil_states=3
-num_word_disambig_syms=1
 position_dependent_phones=true
 # position_dependent_phones is false also when position dependent phones and word_boundary.txt
 # have been generated by another source
diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc
index 00ed56308b3..b0c963595a1 100644
--- a/src/chainbin/nnet3-chain-copy-egs.cc
+++ b/src/chainbin/nnet3-chain-copy-egs.cc
@@ -316,15 +316,15 @@ int main(int argc, char *argv[]) {
           num_written++;
         }
       } else if (count > 0) {
-        const NnetChainExample &eg = example_reader.Value();
+        NnetChainExample eg = example_reader.Value();
+        if (frame_shift != 0)
+          ShiftChainExampleTimes(frame_shift, exclude_names, &eg);
         NnetChainExample eg_out;
         if (left_context != -1 || right_context != -1)
          ModifyChainExampleContext(eg, left_context, right_context,
                                    frame_subsampling_factor, &eg_out);
         else
-          eg_out = eg;
-        if (frame_shift != 0)
-          ShiftChainExampleTimes(frame_shift, exclude_names, &eg_out);
+          eg_out.Swap(&eg);
         if (truncate_deriv_weights != 0)
           TruncateDerivWeights(truncate_deriv_weights, &eg_out);
         for (int32 c = 0; c < count; c++) {
@@ -344,5 +344,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-
-
diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc
index af2147147d7..ad5f715a294 100644
--- a/src/nnet3/nnet-nnet.cc
+++ b/src/nnet3/nnet-nnet.cc
@@ -783,6 +783,13 @@ Nnet& Nnet::operator =(const Nnet &nnet) {
 
 std::string Nnet::Info() const {
   std::ostringstream os;
+
+  if(IsSimpleNnet(*this)) {
+    int32 left_context, right_context;
+    ComputeSimpleNnetContext(*this, &left_context, &right_context);
+    os << "left-context: " << left_context << "\n";
+    os << "right-context: " << right_context << "\n";
+  }
   os << "num-parameters: " << NumParameters(*this) << "\n";
   os << "modulus: " << this->Modulus() << "\n";
   std::vector<std::string> config_lines;
diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc
index 733d162748e..3bacf455f3b 100644
--- a/src/nnet3/nnet-parse.cc
+++ b/src/nnet3/nnet-parse.cc
@@ -427,7 +427,7 @@ bool IsValidName(const std::string &name) {
   for (size_t i = 0; i < name.size(); i++) {
     if (i == 0 && !isalpha(name[i]) && name[i] != '_')
       return false;
-    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-')
+    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.')
      return false;
   }
   return true;
 }
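The nnet-parse.cc change is what makes the dotted node names handled above legal in the first place. Restated in Python purely for illustration (the C++ above is authoritative, and this sketch ignores locale differences in isalnum):

def is_valid_name(name):
    # mirrors IsValidName() after this patch: first char must be a letter or
    # underscore; every char may be alphanumeric, '_', '-', or (newly) '.'
    for i, ch in enumerate(name):
        if i == 0 and not (ch.isalpha() or ch == '_'):
            return False
        if not (ch.isalnum() or ch in '_-.'):
            return False
    return True

assert is_valid_name("lstm1.c_t")      # dotted component/node names now pass
assert not is_valid_name("1bad.name")  # still must start with a letter or '_'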
diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index b84ac90c76e..f48885175b4 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -96,16 +96,16 @@ void DropoutComponent::InitFromConfig(ConfigLine *cfl) {
   BaseFloat dropout_proportion = 0.0;
   bool ok = cfl->GetValue("dim", &dim) &&
     cfl->GetValue("dropout-proportion", &dropout_proportion);
-  if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
+  if (!ok || cfl->HasUnusedValues() || dim <= 0 ||
       dropout_proportion < 0.0 || dropout_proportion > 1.0)
-    KALDI_ERR << "Invalid initializer for layer of type "
-              << Type() << ": \"" << cfl->WholeLine() << "\"";
+    KALDI_ERR << "Invalid initializer for layer of type "
+              << Type() << ": \"" << cfl->WholeLine() << "\"";
   Init(dim, dropout_proportion);
 }
 
 std::string DropoutComponent::Info() const {
   std::ostringstream stream;
-  stream << Type() << ", dim = " << dim_
+  stream << Type() << ", dim = " << dim_
          << ", dropout-proportion = " << dropout_proportion_;
   return stream.str();
 }
@@ -119,12 +119,12 @@ void DropoutComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
   BaseFloat dropout = dropout_proportion_;
   KALDI_ASSERT(dropout >= 0.0 && dropout <= 1.0);
 
-  // This const_cast is only safe assuming you don't attempt
+  // This const_cast is only safe assuming you don't attempt
   // to use multi-threaded code with the GPU.
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
 
-  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
-  out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will
+  out->Add(-dropout); // now, a proportion "dropout" will be <0.0
+  out->ApplyHeaviside(); // apply the function (x>0?1:0). Now, a proportion "dropout" will
   // be zero and (1 - dropout) will be 1.0.
 
   out->MulElements(in);
@@ -147,7 +147,7 @@ void DropoutComponent::Backprop(const std::string &debug_info,
 }
 
-
+
 void DropoutComponent::Read(std::istream &is, bool binary) {
   ExpectOneOrTwoTokens(is, binary, "<DropoutComponent>", "<Dim>");
   ReadBasicType(is, binary, &dim_);
diff --git a/src/nnet3/online-nnet3-decodable-simple.h b/src/nnet3/online-nnet3-decodable-simple.h
index cfd70ccea38..af7c18da64b 100644
--- a/src/nnet3/online-nnet3-decodable-simple.h
+++ b/src/nnet3/online-nnet3-decodable-simple.h
@@ -102,6 +102,7 @@ class DecodableNnet3SimpleOnline: public DecodableInterface {
 
   /// Indices are one-based! This is for compatibility with OpenFst.
   virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
+  int32 FrameSubsamplingFactor() const { return opts_.frame_subsampling_factor; }
 
  private:
   /// If the neural-network outputs for this frame are not cached, it computes
diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc
index fd4881666ae..8dd366166c0 100644
--- a/src/online2/online-nnet3-decoding.cc
+++ b/src/online2/online-nnet3-decoding.cc
@@ -72,8 +72,9 @@ void SingleUtteranceNnet3Decoder::GetBestPath(bool end_of_utterance,
 
 bool SingleUtteranceNnet3Decoder::EndpointDetected(
     const OnlineEndpointConfig &config) {
+  int32 subsample = decodable_.FrameSubsamplingFactor();
   return kaldi::EndpointDetected(config, tmodel_,
-                                 feature_pipeline_->FrameShiftInSeconds(),
+                                 feature_pipeline_->FrameShiftInSeconds() * subsample,
                                  decoder_);
 }
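The endpointing fix matters because a model with frame subsampling emits one decoder frame per frame_subsampling_factor input frames, so durations measured in decoder frames correspond to more wall-clock time than the raw feature frame shift suggests. The values below are only an illustration of the scaling now applied:

# Illustration only: typical values, not read from any config here.
frame_shift_seconds = 0.01      # 10 ms feature frame shift
frame_subsampling_factor = 3    # common value for chain models

# EndpointDetected() now receives the effective shift per decoder frame:
effective_shift = frame_shift_seconds * frame_subsampling_factor
print(effective_shift)  # 0.03 -> 30 ms of audio per decoded frame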