diff --git a/egs/babel_multilang/s5/conf/common.fullLP b/egs/babel_multilang/s5/conf/common.fullLP new file mode 100644 index 00000000000..264e51311b8 --- /dev/null +++ b/egs/babel_multilang/s5/conf/common.fullLP @@ -0,0 +1,99 @@ +# BNF training parameters +bnf_num_hidden_layers=6 +bottleneck_dim=42 +bnf_hidden_layer_dim=2048 +bnf_minibatch_size=512 +bnf_init_learning_rate=0.008 +bnf_final_learning_rate=0.0008 +bnf_max_change=40 +bnf_num_jobs=4 +bnf_num_threads=1 +bnf_mixup=10000 +bnf_mpe_learning_rate=0.00009 +bnf_mpe_last_layer_factor=0.1 +bnf_num_gauss_ubm=550 # use fewer UBM Gaussians than the + # non-bottleneck system (which has 800) +bnf_num_gauss_sgmm=50000 # use fewer SGMM sub-states than the + # non-bottleneck system (which has 80000). +bnf_decode_acwt=0.066666 + + +# DNN hybrid system training parameters +dnn_num_hidden_layers=4 +dnn_input_dim=4000 +dnn_output_dim=400 +dnn_init_learning_rate=0.008 +dnn_final_learning_rate=0.0008 +dnn_mixup=12000 + +dnn_mpe_learning_rate=0.00008 +dnn_mpe_last_layer_factor=0.1 +dnn_mpe_retroactive=true + +bnf_every_nth_frame=2 # take every 2nd frame. +babel_type=full + +use_pitch=true + +lmwt_plp_extra_opts=( --min-lmwt 8 --max-lmwt 12 ) +lmwt_bnf_extra_opts=( --min-lmwt 15 --max-lmwt 22 ) +lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 15 ) + +dnn_beam=16.0 +dnn_lat_beam=8.5 + +icu_opt=(--use-icu true --icu-transform Any-Lower) + +if [[ `hostname` == *.tacc.utexas.edu ]] ; then + decode_extra_opts=( --num-threads 4 --parallel-opts "-pe smp 4" ) + sgmm_train_extra_opts=( ) + sgmm_group_extra_opts=( --num_iters 25 ) + sgmm_denlats_extra_opts=( --num-threads 2 ) + sgmm_mmi_extra_opts=(--cmd "local/lonestar.py -pe smp 2") + dnn_denlats_extra_opts=( --num-threads 2 ) + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" ) + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 8 --num-threads 1) + + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1) + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1) + dnn_parallel_opts="-l gpu=1" +else + decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=0.7G") + sgmm_train_extra_opts=( --num-iters 25 ) + sgmm_group_extra_opts=(--group 3 --parallel-opts "-pe smp 3 -l mem_free=7G,ram_free=2.75G" --cmd "queue.pl -l arch=*64 -l mem_free=3.0G,ram_free=3.0G") + sgmm_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G") + sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=3.2G,ram_free=3.2G") + dnn_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=0.8G") + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 8 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + dnn_parallel_opts="-l gpu=1" + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 8 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") +fi + +icu_transform="Any-Lower" +case_insensitive=true + + +max_states=150000 +wip=0.5 + + +phoneme_mapping= + +minimize=true + +proxy_phone_beam=-1 +proxy_phone_nbest=-1 +proxy_beam=5 +proxy_nbest=500 + +extlex_proxy_phone_beam=5 +extlex_proxy_phone_nbest=300 +extlex_proxy_beam=-1 
+extlex_proxy_nbest=-1 diff --git a/egs/babel_multilang/s5/conf/common.limitedLP b/egs/babel_multilang/s5/conf/common.limitedLP new file mode 100644 index 00000000000..49b8fc6ab7c --- /dev/null +++ b/egs/babel_multilang/s5/conf/common.limitedLP @@ -0,0 +1,104 @@ +# BNF training parameters +bnf_num_hidden_layers=5 +bottleneck_dim=42 +bnf_hidden_layer_dim=1024 +bnf_minibatch_size=512 +bnf_init_learning_rate=0.008 +bnf_final_learning_rate=0.0008 +bnf_max_change=40 +bnf_num_jobs=4 +bnf_num_threads=1 +bnf_mixup=5000 +bnf_mpe_learning_rate=0.00009 +bnf_mpe_last_layer_factor=0.1 +bnf_num_gauss_ubm=500 # use fewer UBM Gaussians than the + # non-bottleneck system (which has 750) +bnf_num_gauss_sgmm=10000 # use fewer SGMM sub-states than the + # non-bottleneck system (which has 18000). +bnf_decode_acwt=0.066666 + + +## DNN hybrid system training parameters +dnn_num_hidden_layers=3 +dnn_input_dim=2000 +dnn_output_dim=200 +dnn_init_learning_rate=0.008 +dnn_final_learning_rate=0.0008 +dnn_mixup=5000 + +dnn_mpe_learning_rate=0.00009 +dnn_mpe_last_layer_factor=0.1 +dnn_mpe_retroactive=true + +bnf_every_nth_frame=1 # take all frames. +babel_type=limited + +use_pitch=true + +lmwt_plp_extra_opts=( --min-lmwt 8 --max-lmwt 12 ) +lmwt_bnf_extra_opts=( --min-lmwt 15 --max-lmwt 22 ) +lmwt_dnn_extra_opts=( --min-lmwt 10 --max-lmwt 15 ) + +dnn_beam=16.0 +dnn_lat_beam=8.5 + +icu_opt=(--use-icu true --icu-transform Any-Lower) + +# Semi-supervised examples options +dnn_update_egs_opts=(--weight-threshold 0.7 --splice-width 4 --samples-per-iter 200000 --num-jobs-nnet 4 --io-opts "-tc 5" ) + +if [[ `hostname` == *.tacc.utexas.edu ]] ; then + decode_extra_opts=( --num-threads 4 --parallel-opts "-pe smp 4" ) + sgmm_train_extra_opts=( --num-iters 25 ) + sgmm_group_extra_opts=( ) + sgmm_denlats_extra_opts=( --num-threads 1 ) + dnn_denlats_extra_opts=( --num-threads 1 ) + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" ) + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 4 --num-threads 1 + --parallel-opts "-pe smp 16" ) + + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 4 --num-threads 1) + + dnn_update_parallel_opts=( --num-epochs 15 --num-epochs-extra 5 --num-iters-final 20 ) +else + decode_extra_opts=(--num-threads 6 --parallel-opts "-pe smp 6 -l mem_free=4G,ram_free=4.0G") + sgmm_train_extra_opts=( --num-iters 25 ) + sgmm_group_extra_opts=(--group 3 --parallel-opts "-pe smp 3 -l mem_free=7G,ram_free=7.0G" --cmd "queue.pl -l arch=*64 -l mem_free=2.0G,ram_free=2.0G") + sgmm_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2.0G") + sgmm_mmi_extra_opts=(--cmd "queue.pl -l arch=*64 -l mem_free=1.5G,ram_free=1.5G") + dnn_denlats_extra_opts=(--num-threads 4 --parallel-opts "-pe smp 4" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2.0G") + + dnn_cpu_parallel_opts=(--minibatch-size 128 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2G") + dnn_gpu_parallel_opts=(--minibatch-size 512 --num-jobs-nnet 4 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2G") + dnn_parallel_opts="-l gpu=1" + dnn_gpu_mpe_parallel_opts=(--num-jobs-nnet 4 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=2G") + + dnn_update_parallel_opts=( --num-epochs 15 --num-epochs-extra 5 --num-iters-final 20 ) +fi + +icu_transform="Any-Lower" 
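+# 'Any-Lower' is an ICU transliteration rule; together with the case_insensitive
+# setting below it is presumably what makes keyword/lexicon matching case-insensitive.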
+case_insensitive=true + + +max_states=150000 +wip=0.5 + + +phoneme_mapping= + +minimize=true + +proxy_phone_beam=-1 +proxy_phone_nbest=-1 +proxy_beam=5 +proxy_nbest=500 + +extlex_proxy_phone_beam=5 +extlex_proxy_phone_nbest=300 +extlex_proxy_beam=-1 +extlex_proxy_nbest=-1 diff --git a/egs/babel_multilang/s5/conf/common.semisupervised.limitedLP b/egs/babel_multilang/s5/conf/common.semisupervised.limitedLP new file mode 120000 index 00000000000..85955be6954 --- /dev/null +++ b/egs/babel_multilang/s5/conf/common.semisupervised.limitedLP @@ -0,0 +1 @@ +../../../babel/s5c/conf/common.semisupervised.limitedLP \ No newline at end of file diff --git a/egs/babel_multilang/s5/conf/common_vars.sh b/egs/babel_multilang/s5/conf/common_vars.sh new file mode 100644 index 00000000000..4c285f60ce5 --- /dev/null +++ b/egs/babel_multilang/s5/conf/common_vars.sh @@ -0,0 +1,21 @@ +#keyword search default +glmFile=conf/glm +duptime=0.5 +case_insensitive=false +use_pitch=true +# Lexicon and Language Model parameters +oovSymbol="" +lexiconFlags="-oov " +boost_sil=1.5 # note from Dan: I expect 1.0 might be better (equivalent to not + # having the option)... should test. +cer=0 + +#Declaring here to make the definition inside the language conf files more +# transparent and nice +declare -A dev10h_more_kwlists +declare -A dev2h_more_kwlists +declare -A eval_more_kwlists +declare -A shadow_more_kwlists + +[ -f ./path.sh ] && . ./path.sh; # source the path. +[ -f ./cmd.sh ] && . ./cmd.sh; # source train and decode cmds. diff --git a/egs/babel_multilang/s5/conf/decode.config b/egs/babel_multilang/s5/conf/decode.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/babel_multilang/s5/conf/decode_dnn.config b/egs/babel_multilang/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/babel_multilang/s5/conf/glm b/egs/babel_multilang/s5/conf/glm new file mode 120000 index 00000000000..54a69f7d856 --- /dev/null +++ b/egs/babel_multilang/s5/conf/glm @@ -0,0 +1 @@ +../../../babel/s5c/conf/glm \ No newline at end of file diff --git a/egs/babel_multilang/s5/conf/lang b/egs/babel_multilang/s5/conf/lang new file mode 120000 index 00000000000..efc3224fa69 --- /dev/null +++ b/egs/babel_multilang/s5/conf/lang @@ -0,0 +1 @@ +../../../babel/s5c/conf/lang \ No newline at end of file diff --git a/egs/babel_multilang/s5/conf/mfcc.conf b/egs/babel_multilang/s5/conf/mfcc.conf new file mode 100644 index 00000000000..af5f9c081bc --- /dev/null +++ b/egs/babel_multilang/s5/conf/mfcc.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) + + diff --git a/egs/babel_multilang/s5/conf/mfcc_hires.conf b/egs/babel_multilang/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..e7888d44a0b --- /dev/null +++ b/egs/babel_multilang/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training. 
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--sample-frequency=8000 # BABEL data is sampled at 8kHz
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
+
diff --git a/egs/babel_multilang/s5/conf/online_cmvn.conf b/egs/babel_multilang/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/babel_multilang/s5/conf/pitch.conf b/egs/babel_multilang/s5/conf/pitch.conf
new file mode 100644
index 00000000000..926bcfca92a
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=8000
diff --git a/egs/babel_multilang/s5/conf/plp.conf b/egs/babel_multilang/s5/conf/plp.conf
new file mode 100644
index 00000000000..926bcfca92a
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/plp.conf
@@ -0,0 +1 @@
+--sample-frequency=8000
diff --git a/egs/babel_multilang/s5/conf/queue.conf b/egs/babel_multilang/s5/conf/queue.conf
new file mode 100644
index 00000000000..2b2c354d5e2
--- /dev/null
+++ b/egs/babel_multilang/s5/conf/queue.conf
@@ -0,0 +1,10 @@
+# configuration for the AWS cluster for WS'15.
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0 # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1 # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0
+option gpu=1 -q g.q@b* -l gpu=1
diff --git a/egs/babel_multilang/s5/local/nnet3/extract_ivector_lang.sh b/egs/babel_multilang/s5/local/nnet3/extract_ivector_lang.sh
new file mode 100755
index 00000000000..be6a8c700f3
--- /dev/null
+++ b/egs/babel_multilang/s5/local/nnet3/extract_ivector_lang.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# This script extracts iVectors using the global iVector extractor
+# trained on all languages in the multilingual setup.
+
+. ./cmd.sh
+set -e
+stage=1
+train_set=train
+global_extractor=exp/multi/nnet3/extractor
+ivector_suffix=_gb
+
+[ ! -f ./conf/common_vars.sh ] && echo 'the file conf/common_vars.sh does not exist!' && exit 1
+
+. conf/common_vars.sh || exit 1;
+
+[ -f local.conf ] && . ./local.conf
+
+. ./utils/parse_options.sh
+
+lang=$1
+
+mkdir -p nnet3
+
+if [ $stage -le 8 ]; then
+  # We extract iVectors on all the training data, which will be what we
+  # train the system on.
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$lang/${train_set}_hires data/$lang/${train_set}_max2_hires
+
+  if [ !
-f exp/$lang/nnet3/ivectors_${train_set}${ivector_suffix}/ivector_online.scp ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 200 \ + data/$lang/${train_set}_max2_hires $global_extractor exp/$lang/nnet3/ivectors_${train_set}${ivector_suffix} || exit 1; + fi + +fi + + +exit 0; diff --git a/egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh b/egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh new file mode 100755 index 00000000000..d53faecee6a --- /dev/null +++ b/egs/babel_multilang/s5/local/nnet3/prepare_multilingual_egs.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# This script generates separate egs directory for each input +# language in multilingual setup, which contains both egs.*.ark and egs.*.scp. +# +# This script will generally be called from nnet training script. + +echo "$0 $@" # Print the command line for logging +. ./cmd.sh +set -e + + +# Begin configuration section +cmd= +stage=0 +left_context=13 +right_context=9 +online_multi_ivector_dirs= # list of iVector dir for all languages + # can be used if we are including speaker information as iVectors. + # e.g. "exp/lang1/train-ivector exp/lang2/train-ivector" +samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. +# Configuration to allocate egs +minibatch_size=512 +num_archives=100 +num_jobs=10 +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 4 ]; then + echo "Usage: $0 [opts] num-input-langs " + echo " e.g.: $0 2 data/lang1/train data/lang2/train " + " exp/lang1/tri5_ali exp/lang2/tri5_ali exp/lang1/nnet3/egs exp/lang2/nnet3/egs exp/multi/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --num-jobs # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)" + echo " --frames-per-eg # number of frames per eg on disk" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +num_lang=$1 +shift +args=("$@") + +if [ ${#args[@]} != $[$num_lang*3] ]; then + echo "$0: num of input dirs provided for all langs is not compatible with num-langs in input." && exit 1; +fi + +# read input data, ali and egs dir per lang +for l in `seq 0 $[$num_lang-1]`; do + multi_data_dirs[$l]=${args[$l]} + multi_ali_dirs[$l]=${args[$l+$num_lang]} + multi_egs_dirs[$l]=${args[$l+2*$num_lang]} +done + +echo "$0: Generate separate egs directory per language for multilingual training." 
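+# At this point the positional arguments have been split into three per-language
+# arrays. For illustration (hypothetical paths, mirroring the usage message above),
+# a two-language call lays its arguments out as: data dirs first, then alignment
+# dirs, then egs dirs:
+#   local/nnet3/prepare_multilingual_egs.sh 2 \
+#     data/lang1/train data/lang2/train \
+#     exp/lang1/tri5_ali exp/lang2/tri5_ali \
+#     exp/lang1/nnet3/egs exp/lang2/nnet3/egs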
+online_multi_ivector_dirs=(${online_multi_ivector_dirs[@]})
+for lang_index in `seq 0 $[$num_lang-1]`; do
+  data=${multi_data_dirs[$lang_index]}
+  ali_dir=${multi_ali_dirs[$lang_index]}
+  egs_dir=${multi_egs_dirs[$lang_index]}
+  online_ivector_dir=
+  if [ ! -z "${online_multi_ivector_dirs[$lang_index]}" ]; then
+    online_ivector_dir=${online_multi_ivector_dirs[$lang_index]}
+  fi
+  echo online_ivector_dir = $online_ivector_dir
+  if [ ! -d "$egs_dir" ]; then
+    echo "$0: Generate egs for ${lang_list[$lang_index]}"
+    if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then
+      utils/create_split_dir.pl \
+        /export/b0{3,4,5,6}/$USER/kaldi-data/egs/${lang_list[$lang_index]}-$(date +'%m_%d_%H_%M')/s5/$egs_dir/storage $egs_dir/storage
+    fi
+
+    extra_opts=()
+    [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts")
+    [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir)
+    extra_opts+=(--left-context $left_context)
+    extra_opts+=(--right-context $right_context)
+    echo "$0: calling get_egs.sh"
+    steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \
+      --samples-per-iter $samples_per_iter --stage $stage \
+      --cmd "$cmd" $egs_opts \
+      --generate-egs-scp true \
+      $data $ali_dir $egs_dir || exit 1;
+
+  fi
+done
+
diff --git a/egs/babel_multilang/s5/local/nnet3/run_common_langs.sh b/egs/babel_multilang/s5/local/nnet3/run_common_langs.sh
new file mode 100755
index 00000000000..ca9e8517b44
--- /dev/null
+++ b/egs/babel_multilang/s5/local/nnet3/run_common_langs.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# This script is used to generate MFCC+pitch features for an input language L.
+
+. ./cmd.sh
+set -e
+stage=1
+train_stage=-10
+generate_alignments=true # If true, it regenerates alignments.
+speed_perturb=true
+use_pitch=true # If true, it generates pitch features and combines them with 40dim MFCC.
+pitch_conf=conf/pitch.conf # Configuration used for pitch extraction.
+use_pitch_plp=false # If true, it generates plp+pitch features to be used when regenerating alignments.
+
+[ ! -f ./conf/common_vars.sh ] && echo 'the file conf/common_vars.sh does not exist!' && exit 1
+
+. conf/common_vars.sh || exit 1;
+
+[ -f local.conf ] && . ./local.conf
+
+. ./utils/parse_options.sh
+
+lang=$1
+
+# perturbed data preparation
+train_set=train
+if [ "$speed_perturb" == "true" ]; then
+  if [ $stage -le 1 ]; then
+    # Although the nnet will be trained on high-resolution data, we still have to
+    # perturb the normal data to get the alignments.
+    # _sp stands for speed-perturbed
+    for datadir in train; do
+      ./utils/data/perturb_data_dir_speed_3way.sh data/$lang/${datadir} data/$lang/${datadir}_sp
+
+      # Extract PLP(+pitch) features for the perturbed data.
+      featdir=plp_perturbed/$lang
+      if $use_pitch_plp; then
+        steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $train_nj data/$lang/${datadir}_sp exp/$lang/make_plp_pitch/${datadir}_sp $featdir
+      else
+        steps/make_plp.sh --cmd "$train_cmd" --nj $train_nj data/$lang/${datadir}_sp exp/$lang/make_plp/${datadir}_sp $featdir
+      fi
+      steps/compute_cmvn_stats.sh data/$lang/${datadir}_sp exp/$lang/make_plp/${datadir}_sp $featdir || exit 1;
+      utils/fix_data_dir.sh data/$lang/${datadir}_sp
+    done
+  fi
+
+  train_set=train_sp
+  if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ] && [ !
-f exp/$lang/tri5_ali_sp/.done ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh \ + --nj 70 --cmd "$train_cmd" \ + --boost-silence $boost_sil \ + data/$lang/$train_set data/$lang/lang exp/$lang/tri5 exp/$lang/tri5_ali_sp || exit 1 + touch exp/$lang/tri5_ali_sp/.done + fi +fi + +if [ $stage -le 3 ] && [ ! -f data/$lang/${train_set}_hires/.done ]; then + mfccdir=mfcc_hires/$lang + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/$lang-$date/s5c/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set ; do + utils/copy_data_dir.sh data/$lang/$dataset data/$lang/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/$lang/${dataset}_hires + + utils/data/perturb_data_dir_volume.sh $data_dir || exit 1 ; + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/$lang/${dataset}_hires exp/$lang/make_hires/$dataset $mfccdir; + + steps/compute_cmvn_stats.sh data/$lang/${dataset}_hires exp/$lang/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/$lang/${dataset}_hires; + done + touch data/$lang/${train_set}_hires/.done +fi + +if [ $stage -le 4 ]; then + if [[ "$use_pitch" == "true" ]]; then + pitchdir=pitch/$lang + train_set=${train_set}_hires + for dataset in $train_set; do + if $use_pitch; then + mkdir -p $pitchdir + if [ ! -f data/$lang/${dataset}_pitch/feats.scp ]; then + echo "$0: Generating pitch features for data/$lang as use_pitch=$use_pitch" + utils/copy_data_dir.sh data/$lang/$dataset data/$lang/${dataset}_pitch + steps/make_pitch.sh --nj 70 --pitch-config $pitch_conf \ + --cmd "$train_cmd" data/$lang/${dataset}_pitch exp/$lang/make_pitch/${dataset} $pitchdir; + fi + feat_suffix=_pitch + fi + + if [ ! -f data/$lang/${dataset}_mfcc${feat_suffix}/feats.scp ]; then + steps/append_feats.sh --nj 16 --cmd "$train_cmd" data/$lang/${dataset} \ + data/$lang/${dataset}${feat_suffix} data/$lang/${dataset}_mfcc${feat_suffix} \ + exp/$lang/append_mfcc${feat_suffix}/${dataset} mfcc${feat_suffix}/$lang + + steps/compute_cmvn_stats.sh data/$lang/${dataset}_mfcc${feat_suffix} exp/$lang/make_cmvn_mfcc${feat_suffix}/${x} mfcc${feat_suffix}/$lang + fi + done + fi +fi + +exit 0; diff --git a/egs/babel_multilang/s5/local/nnet3/run_multilingual_bnf.sh b/egs/babel_multilang/s5/local/nnet3/run_multilingual_bnf.sh new file mode 100755 index 00000000000..8a896778446 --- /dev/null +++ b/egs/babel_multilang/s5/local/nnet3/run_multilingual_bnf.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# This script trains a multilingual model using 6 layer TDNN + Xent +# with 42 dim bottleneck layer in fifth layer for Georgian. +# The lang_list contains 10 closest fullLP langs to Georgian + fullLP Georgian. +# Then it extracts bottleneck features for input language "lang" and +# train SAT model using these feautures. + +# Copyright 2016 Pegah Ghahremani +# Apache 2.0 + +#This yields approx 70 hours of data +# this script generates bottleneck features from multilingual model +# trained on list of languages and dump the bnf for specific language L. +set -e #Exit on non-zero return code from any command +set -o pipefail #Exit if any of the commands in the pipeline will + #return non-zero return code +. 
conf/common_vars.sh || exit 1; + +set -u #Fail on an undefined variable +bnf_train_stage=-100 # the stage variable used in multilingual bottleneck training. +stage=1 +num_archives=20 +speed_perturb=true +multidir=exp/nnet3/multi_bnf_10_close_lang_plus_grg +global_extractor=exp/multi/nnet3/extractor +lang_list=(404-georgian 403-dholuo 402-javanese 401-mongolian 307-amharic) +use_flp=true + +. ./utils/parse_options.sh + + +lang=$1 +. local/prepare_lang_conf.sh --fullLP $use_flp $lang || exit 1; + +if $use_flp; then +. local/prepare_flp_langconf.sh $lang +else +. local/prepare_llp_langconf.sh $lang +fi + +langconf=langconf/$lang/lang.conf +[ ! -f $langconf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1; +. $langconf || exit 1; + +suffix= +if $speed_perturb; then + suffix=_sp +fi + +exp_dir=exp/$lang +datadir=data/$lang/train${suffix}_hires_mfcc_pitch +appended_dir=data/$lang/train${suffix}_hires_mfcc_pitch_bnf +data_bnf_dir=data/$lang/train${suffix}_bnf +dump_bnf_dir=bnf/$lang +ivector_dir=$exp_dir/nnet3/ivectors_train${suffix}_gb +############################################################################### +# +# Training multilingual model with bottleneck layer +# +############################################################################### +mkdir -p $multidir${suffix} + +if [ ! -f $multidir${suffix}/.done ]; then + echo "$0: Train multilingual Bottleneck network using lang list = ${lang_list[@]}" + ./local/nnet3/run_tdnn_joint_babel_sp_bnf.sh --dir $multidir \ + --avg-num-archives $num_archives \ + --global-extractor $global_extractor \ + --train-stage $bnf_train_stage --stage $stage || exit 1; + + touch $multidir${suffix}/.done +else + echo "$0 Skip multilingual Bottleneck network training; you can force to run this step by deleting $multidir${suffix}/.done" +fi + +[ ! -d $dump_bnf_dir ] && mkdir -p $dump_bnf_dir +if [ ! -f $data_bnf_dir/.done ]; then + multidir=$multidir${suffix} + mkdir -p $dump_bnf_dir + # put the archives in ${dump_bnf_dir}/. + steps/nnet3/make_bottleneck_features.sh --use-gpu true --nj 70 --cmd "$train_cmd" \ + --ivector-dir $ivector_dir \ + --bnf-name Tdnn_Bottleneck_renorm \ + $datadir $data_bnf_dir \ + $multidir $dump_bnf_dir $exp_dir/make_train_bnf || exit 1; + touch $data_bnf_dir/.done +else + echo "$0 Skip Bottleneck feature extraction; You can force to run this step deleting $data_bnf_dir/.done." +fi + +if [ ! -d $appended_dir/.done ]; then + steps/append_feats.sh --cmd "$train_cmd" --nj 4 \ + $data_bnf_dir $datadir $appended_dir \ + $exp_dir/append_hires_mfcc_bnf $dump_bnf_dir || exit 1; + steps/compute_cmvn_stats.sh $appended_dir \ + $exp_dir/make_cmvn_mfcc_bnf $dump_bnf_dir || exit 1; + touch $appended_dir/.done +fi + +if [ ! $exp_dir/tri5b/.done -nt $data_bnf_dir/.done ]; then + steps/train_lda_mllt.sh --splice-opts "--left-context=1 --right-context=1" \ + --dim 60 --boost-silence $boost_sil --cmd "$train_cmd" \ + $numLeavesMLLT $numGaussMLLT $appended_dir data/$lang/lang $exp_dir/tri5_ali_sp $exp_dir/tri5b ; + touch $exp_dir/tri5b/.done +fi + +if [ ! 
$exp_dir/tri6/.done -nt $exp_dir/tri5b/.done ]; then
+  steps/train_sat.sh --boost-silence $boost_sil --cmd "$train_cmd" \
+    $numLeavesSAT $numGaussSAT $appended_dir data/$lang/lang \
+    $exp_dir/tri5b $exp_dir/tri6
+  touch $exp_dir/tri6/.done
+fi
+
+echo ---------------------------------------------------------------------
+echo "$0: next, run run-6-bnf-sgmm-semisupervised.sh"
+echo ---------------------------------------------------------------------
+
+exit 0;
diff --git a/egs/babel_multilang/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh b/egs/babel_multilang/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh
new file mode 100755
index 00000000000..a645b4c2193
--- /dev/null
+++ b/egs/babel_multilang/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh
@@ -0,0 +1,235 @@
+#!/bin/bash
+
+# This script can be used to train a multilingual setup using different
+# languages (specifically BABEL languages) with no shared phones.
+# It will generate a separate egs directory for each dataset and combine them
+# during training.
+# In the new multilingual training setup, mini-batches of data corresponding to
+# different languages are randomly sampled during training based on a probability
+# distribution that reflects the relative frequency of the data from each language.
+
+# For all languages, we share all the hidden layers and there is a separate final
+# layer per language.
+# A bottleneck layer can be added to the network structure.
+
+# The script requires you to have baseline PLP features for all languages.
+# It generates 40dim MFCC + pitch features for all languages.
+
+# The global iVector extractor is trained using all languages, and iVectors are
+# extracted for all languages.
+
+echo "$0 $@" # Print the command line for logging
+. ./cmd.sh
+set -e
+
+
+cmd=queue.pl
+stage=0
+train_stage=-10
+get_egs_stage=-10
+decode_stage=-10
+num_jobs_initial=2
+num_jobs_final=8
+speed_perturb=true
+use_pitch=true
+global_extractor=exp/multi/nnet3/extractor
+alidir=tri5_ali
+suffix=
+use_ivector=true
+feat_suffix=_hires_mfcc # The feature suffix describing features used in multilingual training
+                        # _hires_mfcc -> 40dim MFCC
+                        # _hires_mfcc_pitch -> 40dim MFCC + pitch
+                        # _hires_mfcc_pitch_bnf -> 40dim MFCC + pitch + BNF
+# corpora
+# language list used for multilingual training
+# The mapping from lang-name to its abbreviation can be found in
+# local/prepare_lang_conf.sh
+# e.g. lang_list=(101-cantonese 102-assamese 103-bengali)
+lang_list=
+# The languages in this list are decoded using the hybrid multilingual system.
+# e.g. decode_lang_list=(101-cantonese)
+decode_lang_list=
+
+dir=exp/nnet3/multi_bnf
+splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0 0"
+
+ivector_suffix=_gb # if ivector_suffix = _gb, the iVectors are extracted using the global iVector
+                   # extractor trained on pooled data from all languages.
+                   # Otherwise, it uses iVectors extracted using the local (per-language) iVector extractor.
+
+. ./path.sh
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+[ -f local.conf ] && . ./local.conf
+
+num_langs=${#lang_list[@]}
+
+echo "$0 $@" # Print the command line for logging
+if !
cuda-compiled; then + cat </dev/null | grep num-pdfs | awk '{print $2}'` || exit 1; + num_multiple_leaves="$num_multiple_leaves $num_leaves" + multi_data_dirs[$lang_index]=data/${lang_list[$lang_index]}/train${suffix}${feat_suffix} + multi_egs_dirs[$lang_index]=exp/${lang_list[$lang_index]}/nnet3/egs${ivector_suffix} + multi_ali_dirs[$lang_index]=exp/${lang_list[$lang_index]}/tri5_ali${suffix} + multi_ivector_dirs[$lang_index]=exp/${lang_list[$lang_index]}/nnet3/ivectors_train${suffix}${ivector_suffix} +done + +if $use_ivector; then + ivector_dim=$(feat-to-dim scp:${multi_ivector_dirs[0]}/ivector_online.scp -) || exit 1; + echo ivector-dim = $ivector_dim +else + echo "$0: Not using iVectors in multilingual training." + ivector_dim=0 +fi + +feat_dim=`feat-to-dim scp:${multi_data_dirs[0]}/feats.scp -` + + +if [ $stage -le 9 ]; then + mkdir -p $dir/log + echo "$0: creating neural net config for multilingual setups" + # create the config files for nnet initialization + $cmd $dir/log/make_config.log \ + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + --relu-dim 600 \ + --num-multiple-targets "$num_multiple_leaves" \ + --bottleneck-dim 42 --bottleneck-layer 5 \ + --use-presoftmax-prior-scale false \ + --add-lda false \ + $dir/configs || exit 1; + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +if [ $stage -le 10 ]; then + echo "$0: Generate separate egs dir per language for multilingual training." + # sourcing the "vars" below sets + #model_left_context=(something) + #model_right_context=(something) + #num_hidden_layers=(something) + . $dir/configs/vars || exit 1; + + + ivec="${multi_ivector_dirs[@]}" + if $use_ivector; then + ivector_opts=(--online-multi-ivector-dirs "$ivec") + fi + local/nnet3/prepare_multilingual_egs.sh --cmd "$decode_cmd" \ + "${ivector_opts[@]}" \ + --left-context $model_left_context --right-context $model_right_context \ + --samples-per-iter 400000 \ + $num_langs ${multi_data_dirs[@]} ${multi_ali_dirs[@]} ${multi_egs_dirs[@]} || exit 1; +fi + +if [ $stage -le 11 ]; then + echo "$0: training mutilingual model." + common_egs_dir="${multi_egs_dirs[@]} $dir/egs" + echo common_egs_dir = $common_egs_dir + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --use-dense-target false \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --feat-dir ${multi_data_dirs[0]} \ + --feat.online-ivector-dir ${multi_ivector_dirs[0]} \ + --egs.dir "${common_egs_dir[@]}" \ + --cleanup.remove-egs false \ + --cleanup.preserve-model-interval 20 \ + --use-gpu true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +# decoding different languages +if [ $stage -le 12 ]; then + num_decode_lang=${#decode_lang_list[@]} + ( + for lang in `seq 0 $[$num_decode_lang-1]`; do + if [ ! -f $dir/${decode_lang_list[$lang]}/decode_dev10h.pem/.done ]; then + cp $dir/cmvn_opts $dir/${decode_lang_list[$lang]}/. 
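+      # (assumption: the per-language decode step looks for cmvn_opts inside the
+      # language sub-directory of $dir, so it is copied there to keep that
+      # directory self-contained)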
+ echo "Decoding lang ${decode_lang_list[$lang]} using multilingual hybrid model $dir" + run-4-anydecode-langs.sh --use-ivector $use_ivector --nnet3-dir $dir ${decode_lang_list[$lang]} || exit 1; + touch $dir/${decode_lang_list[$lang]}/decode_dev10h.pem/.done + fi + done + wait + ) +fi diff --git a/egs/babel_multilang/s5/local/prepare_lang_conf.sh b/egs/babel_multilang/s5/local/prepare_lang_conf.sh new file mode 100755 index 00000000000..18eceaa9403 --- /dev/null +++ b/egs/babel_multilang/s5/local/prepare_lang_conf.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# This script maps lang-name to its config w.r.t fullLP or limitedLP condition. + +fullLP=true +. ./utils/parse_options.sh + +if [ $# -ne 1 ]; then + echo "Usage: $(basename $0) " + echo " e.g.: $(basename $0) ASM" + exit 1 +fi + +L=$1 +echo L = $L and fullLP = $fullLP +if $fullLP; then + lang_type=-fullLP + lang_type2=.FLP +else + lang_type=-limitedLP + lang_type2=.LLP +fi + +case "$L" in + 101-cantonese) + langconf=conf/lang/101-cantonese${lang_type}.official.conf + ;; + 102-assamese) + langconf=conf/lang/102-assamese${lang_type}.official.conf + ;; + 103-bengali) + langconf=conf/lang/103-bengali${lang_type}.official.conf + ;; + 104-pashto) + langconf=conf/lang/104-pashto${lang_type}.official.conf + ;; + 105-turkish) + langconf=conf/lang/105-turkish${lang_type}.official.conf + ;; + 106-tagalog) + langconf=conf/lang/106-tagalog${lang_type}.official.conf + ;; + 107-vietnamese) + langconf=conf/lang/107-vietnamese${lang_type}.official.conf + ;; + 201-haitian) + langconf=conf/lang/201-haitian${lang_type}.official.conf + ;; + 202-swahili) + langconf=conf/lang/202-swahili${lang_type}.official.conf + ;; + 203-lao) + langconf=conf/lang/203-lao${lang_type}.official.conf + ;; + 204-tamil) + langconf=conf/lang/204-tamil${lang_type}.official.conf + ;; + 205-kurmanji) + langconf=conf/lang/205-kurmanji${lang_type2}.official.conf + ;; + 206-zulu) + langconf=conf/lang/206-zulu-${lang_type}.official.conf + ;; + 207-tokpisin) + langconf=conf/lang/207-tokpisin${lang_type2}.official.conf + ;; + 301-cebuano) + langconf=conf/lang/301-cebuano${lang_type2}.official.conf + ;; + 302-kazakh) + langconf=conf/lang/302-kazakh${lang_type2}.official.conf + ;; + 303-telugu) + langconf=conf/lang/303-telugu${lang_type2}.official.conf + ;; + 304-lithuanian) + langconf=conf/lang/304-lithuanian${lang_type2}.official.conf + ;; + 305-guarani) + langconf=conf/lang/305-guarani${lang_type2}.official.conf + ;; + 306-igbo) + langconf=conf/lang/306-igbo${lang_type2}.official.conf + ;; + 307-amharic) + langconf=conf/lang/307-amharic${lang_type2}.official.conf + ;; + 401-mongolian) + langconf=conf/lang/401-mongolian${lang_type2}.official.conf + ;; + 402-javanese) + langconf=conf/lang/402-javanese${lang_type2}.official.conf + ;; + 403-dholuo) + langconf=conf/lang/403-dholuo${lang_type2}.official.conf + ;; + 404-georgian) + langconf=conf/lang/404-georgian.FLP.official.conf + ;; + *) + echo "Unknown language code $L." 
&& exit 1 +esac + +mkdir -p langconf/$L +rm -rf langconf/$L/* +cp $langconf langconf/$L/lang.conf + diff --git a/egs/babel_multilang/s5/run-4-anydecode-langs.sh b/egs/babel_multilang/s5/run-4-anydecode-langs.sh new file mode 100755 index 00000000000..3e13e5eb3a6 Binary files /dev/null and b/egs/babel_multilang/s5/run-4-anydecode-langs.sh differ diff --git a/egs/babel_multilang/s5/utils b/egs/babel_multilang/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/babel_multilang/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py index 9b9ce4a54ad..0b85012e7d0 100644 --- a/egs/wsj/s5/steps/nnet3/components.py +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -26,12 +26,16 @@ def GetSumDescriptor(inputs): return sum_descriptors # adds the input nodes and returns the descriptor -def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): +def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0, idct_mat = None): components = config_lines['components'] component_nodes = config_lines['component-nodes'] output_dim = 0 components.append('input-node name=input dim=' + str(feat_dim)) - list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + if idct_mat is not None: + prev_layer_output = AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, idct_mat) + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] output_dim += len(splice_indexes) * feat_dim if ivector_dim > 0: components.append('input-node name=ivector dim=' + str(ivector_dim)) @@ -158,6 +162,11 @@ def AddConvolutionLayer(config_lines, name, input, else: conv_init_string += " num-filters={0}".format(num_filters) + if param_stddev is not None: + conv_init_string += " param-stddev={0}".format(param_stddev) + if bias_stddev is not None: + conv_init_string += " bias-stddev={0}".format(bias_stddev) + components.append(conv_init_string) component_nodes.append("component-node name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) @@ -448,4 +457,4 @@ def AddBLstmLayer(config_lines, 'descriptor': output_descriptor, 'dimension':output_dim } - + diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 79bfc25fff6..b00f0d10102 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -39,7 +39,7 @@ num_utts_subset=300 # number of utterances in validation and training num_valid_frames_combine=0 # #valid frames for combination weights at the very end. num_train_frames_combine=10000 # # train frames for the above. num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs -samples_per_iter=400000 # this is the target number of egs in each archive of egs +samples_per_iter=40000 # this is the target number of egs in each archive of egs # (prior to merging egs). We probably should have called # it egs_per_iter. This is just a guideline; it will pick # a number that divides the number of samples in the @@ -56,6 +56,7 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). 
This is used to turn off CMVN in the online-nnet experiments. +generate_egs_scp=false # If true, it will generate egs.JOB.*.scp per egs archive echo "$0 $@" # Print the command line for logging @@ -294,23 +295,37 @@ if [ $stage -le 3 ]; then wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 echo "... Getting subsets of validation examples for diagnostics and combination." + if $generate_egs_scp; then + valid_combine_output="ark,scp:$dir/valid_combine.egs,$dir/valid_combine.egs.scp" + valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.egs,$dir/valid_diagnostic.egs.scp" + train_combine_output="ark,scp:$dir/train_combine.egs,$dir/train_combine.egs.scp" + train_diagnostic_output="ark,scp:$dir/train_diagnostic.egs,$dir/train_diagnostic.egs.scp" + else + valid_combine_output="ark:$dir/valid_combine.egs" + valid_diagnostic_output="ark:$dir/valid_diagnostic.egs" + train_combine_output="ark:$dir/train_combine.egs" + train_diagnostic_output="ark:$dir/train_diagnostic.egs" + fi $cmd $dir/log/create_valid_subset_combine.log \ nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ - ark:$dir/valid_combine.egs || touch $dir/.error & + $valid_combine_output || touch $dir/.error & $cmd $dir/log/create_valid_subset_diagnostic.log \ nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ - ark:$dir/valid_diagnostic.egs || touch $dir/.error & + $valid_diagnostic_output || touch $dir/.error & $cmd $dir/log/create_train_subset_combine.log \ nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ - ark:$dir/train_combine.egs || touch $dir/.error & + $train_combine_output || touch $dir/.error & $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ - ark:$dir/train_diagnostic.egs || touch $dir/.error & + $train_diagnostic_output || touch $dir/.error & wait sleep 5 # wait for file system to sync. cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs - + if $generate_egs_scp; then + cat $dir/valid_combine.egs.scp $dir/train_combine.egs.scp > $dir/combine.egs.scp + rm $dir/{train,valid}_combine.egs.scp + fi for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do [ ! -s $f ] && echo "No examples in file $f" && exit 1; done @@ -345,15 +360,32 @@ if [ $stage -le 5 ]; then done if [ $archives_multiple == 1 ]; then # normal case. + if $generate_egs_scp; then + output_archive="ark,scp:$dir/egs.JOB.ark,$dir/egs.JOB.scp" + else + output_archive="ark:$dir/egs.JOB.ark" + fi $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" $output_archive || exit 1; + + if $generate_egs_scp; then + #concatenate egs.JOB.scp in single egs.scp + rm -rf $dir/egs.scp + for j in $(seq $num_archives_intermediate); do + cat $dir/egs.$j.scp || exit 1; + done > $dir/egs.scp || exit 1; + fi else # we need to shuffle the 'intermediate archives' and then split into the # final archives. we create soft links to manage this splitting, because # otherwise managing the output names is quite difficult (and we don't want # to submit separate queue jobs for each intermediate archive, because then # the --max-jobs-run option is hard to enforce). 
- output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" + if $generate_egs_scp; then + output_archives="$(for y in $(seq $archives_multiple); do echo ark,scp:$dir/egs.JOB.$y.ark,$dir/egs.JOB.$y.scp; done)" + else + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" + fi for x in $(seq $num_archives_intermediate); do for y in $(seq $archives_multiple); do archive_index=$[($x-1)*$archives_multiple+$y] @@ -364,8 +396,17 @@ if [ $stage -le 5 ]; then $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ nnet3-shuffle-egs --srand=\$[JOB+$srand] "ark:cat $egs_list|" ark:- \| \ nnet3-copy-egs ark:- $output_archives || exit 1; - fi + if $generate_egs_scp; then + #concatenate egs.JOB.scp in single egs.scp + rm -rf $dir/egs.scp + for j in $(seq $num_archives_intermediate); do + for y in $(seq $num_archives_intermediate); do + cat $dir/egs.$j.$y.scp || exit 1; + done + done > $dir/egs.scp || exit 1; + fi + fi fi if [ $stage -le 6 ]; then diff --git a/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py new file mode 100644 index 00000000000..3c77f0ae00e --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/libs/rnn_train_lib.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +# This is a module with methods which will be used by scripts for training of +# recurrent neural network acoustic model and raw model (i.e., generic neural +# network without transition model) with frame-level objectives. + +import logging +import imp + +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
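+            # Note: only the first job writes the cache; the compiled computations
+            # should be identical across jobs, so a single copy is enough for the
+            # next iteration to read back (see the --read-cache option set in
+            # TrainOneIteration below).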
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = nnet3_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + cv_minibatch_size, run_opts, + compute_accuracy = True, get_raw_nnet_from_am = True): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + nnet3_train_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am, + compute_accuracy = compute_accuracy) + + if iter > 0: + nnet3_train_lib.ComputeProgress(dir, iter, egs_dir, run_opts, + mb_size=cv_minibatch_size, + get_raw_nnet_from_am = get_raw_nnet_from_am) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. 
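+    # (the cache is only re-used, via --read-cache, when the network structure is
+    # unchanged; on iterations where a new hidden layer has just been added the
+    # cached computations would no longer match, so cache_read_opt is left empty
+    # in that case)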
+ cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + cache_read_opt, run_opts) + [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + nnet3_train_lib.GetAverageNnetModel( + dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) + + else: + # choose the best model from different jobs + nnet3_train_lib.GetBestNnetModel( + dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + shrink = shrinkage_value) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if get_raw_nnet_from_am: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + diff --git a/egs/wsj/s5/steps/nnet3/libs/train_lib.py b/egs/wsj/s5/steps/nnet3/libs/train_lib.py new file mode 100644 index 00000000000..f1ad2b797e2 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/libs/train_lib.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + +# This is a module with methods which will be used by scripts for training of +# deep neural network acoustic model and raw model (i.e., generic neural +# network without transition model) with frame-level objectives. + +import logging +import math +import imp +import os +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def AddCommonTrainArgs(parser): + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = nnet3_train_lib.NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.srand", type=int, dest='srand', + default = 0, + help="Sets the random seed for model initialization and egs shuffling. " + "Warning: This random seed does not control all aspects of this experiment. " + "There might be other random seeds used in other stages of the experiment " + "like data preparation (e.g. volume perturbation).") + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. 
These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. + (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = nnet3_train_lib.NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. 
queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--egs.cmd", type=str, action = nnet3_train_lib.NullstrToNoneAction, + dest = "egs_command", + help="""Script to launch egs jobs""", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = nnet3_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = nnet3_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = nnet3_train_lib.StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter MOD preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = nnet3_train_lib.NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, srand, num_jobs, + num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
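+            # Only the first job writes the cache; on later iterations
+            # TrainOneIteration passes --read-cache=<dir>/cache.<iter> so that
+            # every job can reuse the compiled computations instead of recompiling.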
+ cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + egs_for_train_string = nnet3_train_lib.ExampleString(egs_dir, minibatch_size, + context_opts = context_opts, archive_index = archive_index, + iter = iter, shuffle_buffer_size = shuffle_buffer_size, + frame = frame) + + process_handle = nnet3_train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=20 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "{egs_for_train}" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_for_train = egs_for_train_string), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, srand, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts, + compute_accuracy = True, get_raw_nnet_from_am = True): + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + # check if different iterations use the same random seed + if os.path.exists('{0}/srand'.format(dir)): + try: + saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the random seed for training') + if srand != saved_srand: + logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) + else: + f = open('{0}/srand'.format(dir), 'w') + f.write(str(srand)) + f.close() + + nnet3_train_lib.ComputeTrainCvProbabilities( + dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am, + compute_accuracy = compute_accuracy) + + if iter > 0: + nnet3_train_lib.ComputeProgress( + dir, iter, egs_dir, run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just added new hiden layer, don't do + # averaging but take the best. 
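+        # Worked example: with add_layers_period=2 this branch fires on iterations
+        # 2, 4, ..., (num_hidden_layers-1)*2; e.g. on iter=4 we initialize from
+        # configs/layer3.config (cur_num_hidden_layers = 1 + 4/2 = 3).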
+ cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if get_raw_nnet_from_am: + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + else: + raw_model_string = "nnet3-copy --learning-rate={lr} {dir}/{iter}.raw - |".format(lr = learning_rate, dir = dir, iter = iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + cache_read_opt, run_opts) + [models_to_average, best_model] = nnet3_train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + nnet3_train_lib.GetAverageNnetModel( + dir = dir, iter = iter, + nnets_list = " ".join(nnets_list), + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) + else: + # choose the best model from different jobs + nnet3_train_lib.GetBestNnetModel( + dir = dir, iter = iter, + best_model_index = best_model, + run_opts = run_opts, + get_raw_nnet_from_am = get_raw_nnet_from_am) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + if get_raw_nnet_from_am: + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + else: + new_model = "{0}/{1}.raw".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. 
Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 53739f0f9ce..996d64eef2e 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -50,6 +50,19 @@ def GetArgs(): default=0.0) parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "This variable needs to be set to \"false\" when using dense-targets " + "or when --add-idct is set to \"true\".", + default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a sigmoid layer as the final layer. Applicable only if skip-final-softmax is true.", + choices=['true', 'false'], default = False) + parser.add_argument("--objective-type", type=str, default="linear", + choices = ["linear", "quadratic"], + help = "the type of objective; i.e. quadratic or linear") # LSTM options parser.add_argument("--num-lstm-layers", type=int, @@ -86,6 +99,16 @@ def GetArgs(): parser.add_argument("--lstm-delay", type=str, default=None, help="option to have different delays in recurrence for each lstm") + # Options to convert input MFCC into Fbank features. This is useful when a + # LDA layer is not added (such as when using dense targets) + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 
22.0", default=22.0) + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", + default = False, choices = ["true", "false"]) + parser.add_argument("config_dir", help="Directory to write config files and variables") @@ -115,6 +138,9 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + if not args.num_targets > 0: print(args.num_targets) raise Exception("num_targets has to be positive") @@ -208,28 +234,39 @@ def ParseLstmDelayString(lstm_delay): return lstm_delay_array -def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, add_lda, + add_idct, cepstral_lifter, splice_indexes, lstm_delay, cell_dim, hidden_dim, recurrent_projection_dim, non_recurrent_projection_dim, num_lstm_layers, num_hidden_layers, norm_based_clipping, clipping_threshold, ng_per_element_scale_options, ng_affine_options, - label_delay, include_log_softmax, xent_regularize, + label_delay, include_log_softmax, add_final_sigmoid, + objective_type, xent_regularize, self_repair_scale_nonlinearity, self_repair_scale_clipgradient): config_lines = {'components':[], 'component-nodes':[]} + if add_idct: + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if add_idct else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - nodes.AddOutputLayer(init_config_lines, prev_layer_output) + nodes.AddOutputLayer(init_config_lines, prev_layer_output, label_delay = label_delay, objective_type = objective_type) config_files[config_dir + '/init.config'] = init_config_lines - prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + # add_lda needs to be set "false" when using dense targets, + # or if the task is not a simple classification task + # (e.g. 
regression, multi-task) + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') for i in range(num_lstm_layers): if len(lstm_delay[i]) == 2: # add a bi-directional LSTM layer @@ -248,7 +285,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, lstm_delay = lstm_delay[i][0], self_repair_scale_nonlinearity = self_repair_scale_nonlinearity, self_repair_scale_clipgradient = self_repair_scale_clipgradient) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: @@ -265,7 +302,7 @@ def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, ng_affine_options, self_repair_scale = self_repair_scale_nonlinearity) # make the intermediate config file for layerwise discriminative # training - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax, add_final_sigmoid = add_final_sigmoid, objective_type = objective_type) if xent_regularize != 0.0: nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, @@ -293,14 +330,6 @@ def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layer if (num_hidden_layers < num_lstm_layers): raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(config_dir + "/vars", "w") - print('model_left_context=' + str(left_context), file=f) - print('model_right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() - return [left_context, right_context, num_hidden_layers, splice_indexes] @@ -308,9 +337,22 @@ def Main(): args = GetArgs() [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(args.config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(args.num_targets), file=f) + print('objective_type=' + str(args.objective_type), file=f) + print('add_lda=' + ("true" if args.add_lda else "false"), file=f) + print('include_log_softmax=' + ("true" if args.include_log_softmax else "false"), file=f) + f.close() + MakeConfigs(config_dir = args.config_dir, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, + add_lda = args.add_lda, + add_idct = args.add_idct, cepstral_lifter = args.cepstral_lifter, splice_indexes = splice_indexes, lstm_delay = args.lstm_delay, cell_dim = args.cell_dim, hidden_dim = 
args.hidden_dim, @@ -324,6 +366,8 @@ def Main(): ng_affine_options = args.ng_affine_options, label_delay = args.label_delay, include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + objective_type = args.objective_type, xent_regularize = args.xent_regularize, self_repair_scale_nonlinearity = args.self_repair_scale_nonlinearity, self_repair_scale_clipgradient = args.self_repair_scale_clipgradient) diff --git a/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh b/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh new file mode 100755 index 00000000000..7af10014f2c --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# 2016 Pegah Ghahremani +# Apache 2.0 +# This script dumps bottleneck feature for model trained using nnet3. + +# Begin configuration section. +stage=1 +nj=4 +cmd=run.pl +use_gpu=false +ivector_dir= +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "usage: steps/nnet3/dump_bottleneck_features.sh " + echo "e.g.: steps/nnet3/dump_bottleneck_features.sh data/train data/train_bnf exp/nnet3/tdnn_bnf bnf exp_bnf/dump_bnf" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ivector-dir # directory for ivectors" + exit 1; +fi + +data=$1 +bnf_data=$2 +nnetdir=$3 +archivedir=$4 +dir=$5 + +# Assume that final.nnet is in nnetdir +cmvn_opts=`cat $nnetdir/cmvn_opts`; +bnf_nnet=$nnetdir/final.raw +node_name=Tdnn_Bottleneck_renorm +if [ ! -f $bnf_nnet ] ; then + echo "No such file $bnf_nnet"; + exit 1; +fi + +if $use_gpu; then + compute_queue_opt="--gpu 1" + compute_gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." 
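+  # Note: in the CPU case each nnet3-compute job runs single-threaded, so raising
+  # --nj is the main way to speed this stage up.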
+ compute_gpu_opt="--use-gpu=no" +fi + + +## Set up input features of nnet +name=`basename $data` +sdata=$data/split$nj + +mkdir -p $dir/log +mkdir -p $bnf_data +echo $nj > $nnetdir/num_jobs +splice_opts=`cat $nnetdir/splice_opts 2>/dev/null` +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +if [ "$ivector_dir" != "" ];then + use_ivector=true +fi + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +ivec_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $ivector_dir/ivector_online.scp |" + +if [ $stage -le 1 ]; then + echo "$0: Generating bottle-neck features" + echo output-node name=output input=$node_name > output.config + modified_bnf_nnet="nnet3-copy --nnet-config=output.config $bnf_nnet - |" + ivector_opts= + if $use_ivector; then + ivec_period=`grep ivector-period $ivector_dir/conf/ivector_extractor.conf | cut -d"=" -f2` + ivector_opts="--online-ivector-period=$ivec_period --online-ivectors='$ivec_feats'" + fi + $cmd $compute_queue_opt JOB=1:$nj $dir/log/make_bnf_$name.JOB.log \ + nnet3-compute $compute_gpu_opt $ivector_opts "$modified_bnf_nnet" "$feats" ark:- \| \ + copy-feats ark:- ark,scp:$archivedir/raw_bnfeat_$name.JOB.ark,$archivedir/raw_bnfeat_$name.JOB.scp || exit 1; +fi + +rm $dir/trans.ark 2>/dev/null + +N0=$(cat $data/feats.scp | wc -l) +N1=$(cat $archivedir/raw_bnfeat_$name.*.scp | wc -l) +if [[ "$N0" != "$N1" ]]; then + echo "Error happens when generating BNF for $name (Original:$N0 BNF:$N1)" + exit 1; +fi + +# Concatenate feats.scp into bnf_data +for n in $(seq $nj); do cat $archivedir/raw_bnfeat_$name.$n.scp; done > $bnf_data/feats.scp + +for f in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do + [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f +done + +echo "$0: computing CMVN stats." +steps/compute_cmvn_stats.sh $bnf_data $dir $archivedir + +echo "$0: done making BNF feats.scp." + +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py new file mode 100644 index 00000000000..ca068c7e6c5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/multilingual/allocate_multilingual_examples.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python + +# This script generates egs.Archive.scp and ranges.* used for generating egs.Archive.scp +# for multilingual setup. +# Also this script generates outputs.*.scp and weight.*.scp, where each line +# corresponds to language-id and weight for the same example in egs.*.scp. +# weight.*.scp used to scale the output's posterior during training. +# ranges.*.scp is generated w.r.t frequency distribution of remaining examples +# in each language. +# +# You call this script as (e.g.) +# +# allocate_multilingual_examples.py [opts] num-of-languages example-scp-lists multilingual-egs-dir +# +# allocate_multilingual_examples.py --num-jobs 10 --samples-per-iter 10000 --minibatch-size 512 +# --lang2weight exp/multi/lang2weight 2 "exp/lang1/egs.scp exp/lang2/egs.scp" +# exp/multi/egs +# +# This script outputs specific ranges.* files to the temp directory (exp/multi/egs/temp) +# that will enable you to creat egs.*.scp files for multilingual training. +# exp/multi/egs/temp/ranges.* contains something like the following: +# e.g. +# lang1 0 0 256 +# lang2 1 256 256 +# +# where each line can be interpreted as follows: +# +# +# note that is the zero-based line number in egs.scp for +# that language. 
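+# (Concretely, each line of ranges.* written by this script has three fields:
+#  the language id, the start line in that language's egs.scp, and the number
+#  of examples to copy from there.)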
+# num-examples is multiple of actual minibatch-size. +# +# +# egs.1.scp is generated using ranges.1.scp as following: +# "num_examples" consecutive examples starting from line "local-scp-line" from +# egs.scp file for language "source-lang" is copied to egs.1.scp. +# +# + +from __future__ import print_function +import re, os, argparse, sys, math, warnings, random, io, imp +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +def GetArgs(): + + parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and weights.* files " + "in preparation for dumping egs for multilingual training.", + epilog="Called by steps/nnet3/multilingual/get_egs.sh") + parser.add_argument("--samples-per-iter", type=int, default=40000, + help="The target number of egs in each archive of egs, " + "(prior to merging egs). "); + parser.add_argument("--num-jobs", type=int, default=20, + help="This can be used for better randomness in distributing languages across archives." + ", where egs.job.archive.scp generated randomly and examples are combined " + " across all jobs as eg.archive.scp.") + parser.add_argument("--random-lang", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If true, the lang-id in ranges.* selected" + " w.r.t frequency distribution of remaining examples in each language," + " otherwise it is selected sequentially.", + default=True, choices = ["false", "true"]) + parser.add_argument("--max-archives", type=int, default=1000, + help="max number of archives used to generate egs.*.scp"); + parser.add_argument("--seed", type=int, default=1, + help="Seed for random number generator") + + parser.add_argument("--minibatch-size", type=int, default=512, + help="The minibatch size used to generate scp files per job. " + "It should be multiple of actual minibatch size."); + + parser.add_argument("--prefix", type=str, default="", + help="Adds a prefix to the range files. This is used to distinguish between the train " + "and diagnostic files.") + + parser.add_argument("--lang2weight", type=str, + help="lang2weight file contains the weight per language to scale output posterior for that language.(format is: " + " )"); +# now the positional arguments + parser.add_argument("num_langs", type=int, + help="num of languages used in multilingual training setup."); + parser.add_argument("egs_scp_lists", type=str, + help="list of egs.scp files per input language." + "e.g. exp/lang1/egs/egs.scp exp/lang2/egs/egs.scp"); + + parser.add_argument("egs_dir", + help="Name of egs directory e.g. exp/multilingual_a/egs"); + + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + return args + + +# Returns a random language number w.r.t +# amount of examples in each language. +# It works based on sampling from a +# discrete distribution, where it returns i +# with prob(i) as (num_egs in lang(i)/ tot_egs). +# tot_egs is sum of lang_len. +def RandomLang(lang_len, tot_egs, random_selection): + assert(tot_egs > 0) + rand_int = random.randint(0, tot_egs - 1) + count = 0 + for l in range(len(lang_len)): + if random_selection: + if rand_int > count and rand_int <= (count + lang_len[l]): + rand_lang = l + break + else: + count += lang_len[l] + else: + if (lang_len[l] > 0): + rand_lang = l + break + assert(rand_lang >= 0 and rand_lang < len(lang_len)) + return rand_lang + +# Read lang2weight file and return lang2weight array +# where lang2weight[i] is weight for language i. 
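+# A lang2weight file has one line per language, "<lang-id> <weight>", read in
+# language-id order; e.g. (hypothetical values):
+#   0 1.0
+#   1 0.5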
+def ReadLang2weight(lang2w_file): + f = open(lang2w_file, "r"); + if f is None: + raise Exception("Error opening lang2weight file " + str(lang2w_file)) + lang2w = [] + for line in f: + a = line.split() + if len(a) != 2: + raise Exception("bad line in lang2weight file " + line) + lang2w.append(int(a[1])) + f.close() + return lang2w + +# struct to keep archives correspond to each job +class ArchiveToJob(): + def __init__(self, job_id, archives_for_job): + self.job_id = job_id + self.archives = archives_for_job + +def Main(): + args = GetArgs() + random.seed(args.seed) + num_langs = args.num_langs + rand_select = args.random_lang + + # read egs.scp for input languages + scp_lists = args.egs_scp_lists.split(); + assert(len(scp_lists) == num_langs); + + scp_files = [open(scp_lists[lang], 'r') for lang in range(num_langs)] + + # computes lang2len, where lang2len[i] shows number of + # examples for language i. + lang2len = [0] * num_langs + for lang in range(num_langs): + lang2len[lang] = sum(1 for line in open(scp_lists[lang])) + print("Number of examples for language {0} is {1}".format(lang, lang2len[lang])) + + # If weights are not provided, the scaling weights + # are one. + if args.lang2weight is None: + lang2weight = [ 1.0 ] * num_langs + else: + lang2weight = ReadLang2Len(args.lang2weight) + assert(len(lang2weight) == num_langs) + + if not os.path.exists(args.egs_dir + "/temp"): + os.makedirs(args.egs_dir + "/temp") + + num_lang_file = open(args.egs_dir + "/info/" + args.prefix + "num_lang", "w"); + print("{0}".format(num_langs), file = num_lang_file) + + + # Each element of all_egs (one per num_archive * num_jobs) is + # an array of 3-tuples (lang-id, local-start-egs-line, num-egs) + all_egs = [] + lang_len = lang2len[:] + tot_num_egs = sum(lang2len[i] for i in range(len(lang2len))) # total num of egs in all languages + num_archives = max(1, min(args.max_archives, tot_num_egs / args.samples_per_iter)) + + + num_arch_file = open(args.egs_dir + "/info/" + args.prefix + "num_archives", "w"); + print("{0}".format(num_archives), file = num_arch_file) + num_arch_file.close() + + this_num_egs_per_archive = tot_num_egs / (num_archives * args.num_jobs) # num of egs per archive + for job_index in range(args.num_jobs): + for archive_index in range(num_archives): + # Temporary scp.job_index.archive_index files to store egs.scp correspond to each archive. + print("Processing archive {0} for job {1}".format(archive_index + 1, job_index + 1)) + archfile = open(args.egs_dir + "/temp/" + args.prefix + "scp." + str(job_index + 1) + "." + str(archive_index + 1), "w") + + this_egs = [] # this will be array of 2-tuples (lang-id start-frame num-frames) + + num_egs = 0 + while num_egs <= this_num_egs_per_archive: + rem_egs = sum(lang_len[i] for i in range(len(lang_len))) + if rem_egs > 0: + lang_id = RandomLang(lang_len, rem_egs, rand_select) + start_egs = lang2len[lang_id] - lang_len[lang_id] + this_egs.append((lang_id, start_egs, args.minibatch_size)) + for scpline in range(args.minibatch_size): + print("{0} {1}".format(scp_files[lang_id].readline().splitlines()[0], lang_id), file = archfile) + + lang_len[lang_id] = lang_len[lang_id] - args.minibatch_size + num_egs = num_egs + args.minibatch_size; + # If the num of remaining egs in each lang is less than minibatch_size, + # they are discarded. 
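+                    # e.g. with minibatch_size=512 and 700 egs left in the chosen
+                    # language, one more chunk of 512 is still taken above and the
+                    # remaining 188 egs are dropped here.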
+ if lang_len[lang_id] < args.minibatch_size: + lang_len[lang_id] = 0 + print("Run out of data for language {0}".format(lang_id)) + else: + print("Run out of data for all languages.") + break + all_egs.append(this_egs) + archfile.close() + + # combine examples across all jobs correspond to each archive. + for archive in range(num_archives): + print("Processing archive {0} by combining all jobs.".format(archive + 1)) + this_ranges = [] + f = open(args.egs_dir + "/temp/" + args.prefix + "ranges." + str(archive + 1), "w") + o = open(args.egs_dir + "/" + args.prefix + "output." + str(archive + 1), "w") + w = open(args.egs_dir + "/" + args.prefix + "weight." + str(archive + 1), "w") + scp_per_archive_file = open(args.egs_dir + "/" + args.prefix + "egs." + str(archive + 1), "w") + + # check files befor writing. + if f is None: + raise Exception("Error opening file " + args.egs_dir + "/temp/" + args.prefix + "ranges." + str(job + 1)) + if o is None: + raise Exception("Error opening file " + args.egs_dir + "/" + args.prefix + "output." + str(job + 1)) + if w is None: + raise Exception("Error opening file " + args.egs_dir + "/" + args.prefix + "weight." + str(job + 1)) + if scp_per_archive_file is None: + raise Exception("Error opening file " + args.egs_dir + "/" + args.prefix + "egs." + str(archive + 1), "w") + + for job in range(args.num_jobs): + # combine egs.job.archive.scp across all jobs. + scp = args.egs_dir + "/temp/" + args.prefix + "scp." + str(job + 1) + "." + str(archive + 1) + with open(scp,"r") as scpfile: + for line in scpfile: + scp_line = line.splitlines()[0].split() + print("{0} {1}".format(scp_line[0], scp_line[1]), file=scp_per_archive_file) + print("{0} output-{1}".format(scp_line[0], scp_line[2]), file=o) + print("{0} {1}".format(scp_line[0], lang2weight[int(scp_line[2])]), file=w) + os.remove(scp) + + # combine ranges.* across all jobs for archive + for (lang_id, start_eg_line, num_egs) in all_egs[num_archives * job + archive]: + this_ranges.append((lang_id, start_eg_line, num_egs)) + + # write ranges.archive + for (lang_id, start_eg_line, num_egs) in this_ranges: + print("{0} {1} {2}".format(lang_id, start_eg_line, num_egs), file=f) + + scp_per_archive_file.close() + f.close() + o.close() + w.close() + print("allocate_multilingual_examples.py finished generating " + args.prefix + "egs.*.scp and " + args.prefix + "ranges.* and " + args.prefix + "output.*" + args.prefix + "weight.* files") + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/multilingual/get_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/get_egs.sh new file mode 100755 index 00000000000..f97be948c1f --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/multilingual/get_egs.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# +# This script uses separate input egs directory for each language as input, +# to generate egs.*.scp files in multilingual egs directory +# where the scp line points to the original archive for each egs directory. +# $megs/egs.*.scp is randomized w.r.t language id. +# +# Also this script generates egs.JOB.scp, output.JOB.scp and weight.JOB.scp, +# where output file contains language-id for each example +# and weight file contains weights for scaling output posterior +# for each example w.r.t input language. +# +# Begin configuration section. +cmd=run.pl +minibatch_size=512 # multiple of minibatch used during training. +num_jobs=10 # This can be set to max number of jobs to run in parallel; + # Helps for better randomness across languages + # per archive. 
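+                # (Internally, the allocator writes temp/scp.JOB.ARCHIVE files, one
+                # per job and archive, which are then combined across jobs for each
+                # archive.)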
+samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +num_langs=$1 +shift 1 +args=("$@") +megs_dir=${args[-1]} # multilingual directory +mkdir -p $megs_dir +mkdir -p $megs_dir/info + +if [ ${#args[@]} != $[$num_langs+1] ]; then + echo "$0: Number of input example dirs provided is not compatible with num_langs $num_langs." + echo "Usage:$0 [opts] ... " + echo "Usage:$0 [opts] 2 exp/lang1/egs exp/lang2/egs exp/multi/egs" + exit 1; +fi + +required_files="egs.scp combine.egs.scp train_diagnostic.egs.scp valid_diagnostic.egs.scp" +train_scp_list= +train_diagnostic_scp_list= +valid_diagnostic_scp_list= +combine_scp_list= + +# copy paramters from $egs_dir[0]/info +# into multilingual dir egs_dir/info + +params_to_check="feat_dim ivector_dim left_context right_context frames_per_eg" +for param in $params_to_check; do + cat ${args[0]}/info/$param > $megs_dir/info/$param || exit 1; +done + +for lang in $(seq 0 $[$num_langs-1]);do + multi_egs_dir[$lang]=${args[$lang]} + echo "arg[$lang] = ${args[$lang]}" + for f in $required_files; do + if [ ! -f ${multi_egs_dir[$lang]}/$f ]; then + echo "$0: no such a file ${multi_egs_dir[$lang]}/$f." && exit 1; + fi + done + train_scp_list="$train_scp_list ${args[$lang]}/egs.scp" + train_diagnostic_scp_list="$train_diagnostic_scp_list ${args[$lang]}/train_diagnostic.egs.scp" + valid_diagnostic_scp_list="$valid_diagnostic_scp_list ${args[$lang]}/valid_diagnostic.egs.scp" + combine_scp_list="$combine_scp_list ${args[$lang]}/combine.egs.scp" + + # check parameter dimension to be the same in all egs dirs + for f in $params_to_check; do + f1=`cat $megs_dir/info/$param`; + f2=`cat ${multi_egs_dir[$lang]}/info/$f`; + if [ $f1 != $f1 ]; then + echo "$0: mismatch in dimension for $f parameter in ${multi_egs_dir[$lang]}." + exit 1; + fi + done +done + +if [ $stage -le 0 ]; then + echo "$0: allocating multilingual examples for training." + # Generate egs.*.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --minibatch-size $minibatch_size \ + --samples-per-iter $samples_per_iter \ + $num_langs "$train_scp_list" $megs_dir || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combine combine.egs.scp examples from all langs in $megs_dir/combine.egs.scp." + # Generate combine.egs.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_combine.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --random-lang false \ + --max-archives 1 --num-jobs 1 \ + --minibatch-size $minibatch_size \ + --prefix "combine." \ + $num_langs "$combine_scp_list" $megs_dir || exit 1; + + echo "$0: combine train_diagnostic.egs.scp examples from all langs in $megs_dir/train_diagnostic.egs.scp." + # Generate train_diagnostic.egs.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_train_diagnostic.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --random-lang false \ + --max-archives 1 --num-jobs 1 \ + --minibatch-size $minibatch_size \ + --prefix "train_diagnostic." 
\ + $num_langs "$train_diagnostic_scp_list" $megs_dir || exit 1; + + + echo "$0: combine valid_diagnostic.egs.scp examples from all langs in $megs_dir/valid_diagnostic.egs.scp." + # Generate valid_diagnostic.egs.scp for multilingual setup. + $cmd $megs_dir/log/allocate_multilingual_examples_valid_diagnostic.log \ + python steps/nnet3/multilingual/allocate_multilingual_examples.py \ + --random-lang false --max-archives 1 --num-jobs 1\ + --minibatch-size $minibatch_size \ + --prefix "valid_diagnostic." \ + $num_langs "$valid_diagnostic_scp_list" $megs_dir || exit 1; + +fi + diff --git a/egs/wsj/s5/steps/nnet3/multilingual/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/multilingual/make_tdnn_configs.py new file mode 100755 index 00000000000..9ed80afd1eb --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/multilingual/make_tdnn_configs.py @@ -0,0 +1,555 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training for " + "multilingaul system with multiple output " + "and bottleneck layer", + epilog="See egs/babel_multilingual/s5/local/nnet3/run_tdnn_joint_babel_sp_bnf.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + num_target_group.add_argument("--num-multiple-targets", type=str, + help="space separated number of network targets for different languages(e.g. num-pdf-ids/num-leaves e.g. '1000 2000 3000')") + + # CNN options + parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", + help="CNN parameters at each CNN layer, e.g. 
--filt-x-dim=3 --filt-y-dim=8 " + "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 " + "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, " + "when CNN layers are used, no LDA will be added", default = None) + parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim", + help="Output dimension of the linear layer at the CNN output " + "for dimension reduction, e.g. 256." + "The default zero means this layer is not needed.", default=0) + parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' " + "If CNN layers are used the first set of splice indexes will be used as input " + "to the first CNN layer and later splice indexes will be interpreted as indexes " + "for the TDNNs.") + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "If --cnn.layer is specified this option will be forced to \"false\".", + default=False, choices = ["false", "true"]) + + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a final sigmoid layer as alternate to log-softmax-layer. " + "Can only be used if include-log-softmax is false. " + "This is useful in cases where you want the output to be " + "like probabilities between 0 and 1. Typically the nnet " + "is trained with an objective such as quadratic", + default=False, choices = ["false", "true"]) + + parser.add_argument("--objective-type", type=str, + help = "the type of objective; i.e. 
quadratic or linear", + default="linear", choices = ["linear", "quadratic"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale-nonlinearity", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = False) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + # multilingual tdnn with bn layer config + parser.add_argument("--bottleneck-layer", type=int, + help="The layer number to add bottleneck layer," + "if < 0, means this layer is not needed in network.", + default=-1) + parser.add_argument("--bottleneck-dim", type=int, + help="The bottleneck layer dimension in TDNN network e.g. 42.", + default=40) + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + if args.num_multiple_targets is None: + print(args.num_targets) + raise Exception("num_targets or num_multiple_targets has to be positive") + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + raise Exception("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + args.nonlin_input_dim = args.relu_dim + args.nonlin_output_dim = args.relu_dim + args.nonlin_type = 'relu' + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + raise Exception("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + args.nonlin_input_dim = args.pnorm_input_dim + args.nonlin_output_dim = args.pnorm_output_dim + if (args.nonlin_input_dim < args.nonlin_output_dim) or (args.nonlin_input_dim % args.nonlin_output_dim != 0): + raise Exception("Invalid --pnorm-input-dim {0} and --pnorm-output-dim {1}".format(args.nonlin_input_dim, args.nonlin_output_dim)) + args.nonlin_type = 'pnorm' + + if args.add_final_sigmoid and args.include_log_softmax: + raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.") + + if args.xent_separate_forward_affine and args.add_final_sigmoid: + raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true") + + if args.add_lda and args.cnn_layer is not None: + args.add_lda = False + warnings.warn("--add-lda is set to false as CNN layers are used.") + + return args + +def AddConvMaxpLayer(config_lines, name, input, args): + if '3d-dim' not in input: + raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.") + + input = nodes.AddConvolutionLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.filt_x_dim, args.filt_y_dim, + args.filt_x_step, args.filt_y_step, + args.num_filters, input['vectorization']) + + if args.pool_x_size > 1 or args.pool_y_size > 1 or args.pool_z_size > 1: + input = nodes.AddMaxpoolingLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.pool_x_size, args.pool_y_size, args.pool_z_size, + args.pool_x_step, args.pool_y_step, args.pool_z_step) + + return input + +# The ivectors are processed through an affine layer parallel to the CNN layers, +# then concatenated with the CNN output and passed to the deeper part of the network. 
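+# In summary the front end built by AddCnnLayers is: MFCC input -> fixed IDCT
+# (back to filterbank-like features) -> splicing -> conv+maxpool layers ->
+# optional cnn-bottleneck affine layer -> Append with the processed ivector.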
+def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + cnn_args = ParseCnnString(cnn_layer) + num_cnn_layers = len(cnn_args) + # We use an Idct layer here to convert MFCC to FBANK features + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') + + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + splice_descriptor = "Append({0})".format(", ".join(list)) + cnn_input_dim = len(splice_indexes) * feat_dim + prev_layer_output = {'descriptor': splice_descriptor, + 'dimension': cnn_input_dim, + '3d-dim': [len(splice_indexes), feat_dim, 1], + 'vectorization': 'yzx'} + + for cl in range(0, num_cnn_layers): + prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + + if cnn_bottleneck_dim > 0: + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + + if ivector_dim > 0: + iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', + 'dimension': ivector_dim} + iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) + prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] + + return prev_layer_output + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseCnnString(cnn_param_string_list): + cnn_parser = argparse.ArgumentParser(description="cnn argument parser") + + cnn_parser.add_argument("--filt-x-dim", required=True, type=int) + cnn_parser.add_argument("--filt-y-dim", required=True, type=int) + cnn_parser.add_argument("--filt-x-step", type=int, default = 1) + cnn_parser.add_argument("--filt-y-step", type=int, default = 1) + cnn_parser.add_argument("--num-filters", required=True, type=int) + cnn_parser.add_argument("--pool-x-size", type=int, default = 1) + cnn_parser.add_argument("--pool-y-size", type=int, default = 1) + cnn_parser.add_argument("--pool-z-size", type=int, default = 1) + cnn_parser.add_argument("--pool-x-step", type=int, default = 1) + cnn_parser.add_argument("--pool-y-step", type=int, default = 1) + cnn_parser.add_argument("--pool-z-step", type=int, default = 1) + + cnn_args = [] + for cl in range(0, len(cnn_param_string_list)): + cnn_args.append(cnn_parser.parse_args(shlex.split(cnn_param_string_list[cl]))) + + return cnn_args + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(); # we already checked the string is nonempty. 
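+    # e.g. splice_indexes "-2,-1,0,1,2 0 -3,3 0" yields 4 hidden layers with a
+    # total model left-context of 5 and right-context of 5 (contexts accumulate
+    # over the per-layer splice groups).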
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. +def MakeConfigs(config_dir, splice_indexes_string, + cnn_layer, cnn_bottleneck_dim, cepstral_lifter, + feat_dim, ivector_dim, num_targets, add_lda, + nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim, + use_presoftmax_prior_scale, + final_layer_normalize_target, + include_log_softmax, + add_final_sigmoid, + xent_regularize, + xent_separate_forward_affine, + self_repair_scale, + objective_type, + num_multiple_targets, bottleneck_layer, bottleneck_dim): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + if len(num_multiple_targets) > 1: + for target in range(len(num_multiple_targets)): + nodes.AddOutputLayer(init_config_lines, prev_layer_output, suffix = str(target)) + else: + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + + config_files[config_dir + '/init.config'] = init_config_lines + + if cnn_layer is not None: + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + feat_dim, splice_indexes[0], ivector_dim) + + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + if nonlin_type == "relu" : + prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + elif nonlin_type == "pnorm" : + prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_input_dim, nonlin_output_dim, + norm_target_rms = final_layer_normalize_target) + + prev_layer_output_xent = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_input_dim, nonlin_output_dim, + norm_target_rms = final_layer_normalize_target) + else: + raise Exception("Unknown nonlinearity type") + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + if bottleneck_layer > -1 and i+1 == bottleneck_layer: + print('bottleneck layer and its dimension are {0} and {1} respectively.'.format(bottleneck_layer, bottleneck_dim)) + if nonlin_type == "relu": + prev_layer_output = 
nodes.AddAffRelNormLayer(config_lines, "Tdnn_Bottleneck",
+                                               prev_layer_output, bottleneck_dim,
+                                               self_repair_scale = self_repair_scale,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+
+                elif nonlin_type == "pnorm":
+                    prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_Bottleneck",
+                                               prev_layer_output, nonlin_input_dim, bottleneck_dim,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                else:
+                    raise Exception("Unknown nonlinearity type")
+            else:
+                if nonlin_type == "relu":
+                    prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i),
+                                               prev_layer_output, nonlin_output_dim,
+                                               self_repair_scale = self_repair_scale,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                elif nonlin_type == "pnorm":
+                    prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i),
+                                               prev_layer_output, nonlin_input_dim, nonlin_output_dim,
+                                               norm_target_rms = 1.0 if i < num_hidden_layers - 1 else final_layer_normalize_target)
+                else:
+                    raise Exception("Unknown nonlinearity type")
+            # Add a separate pre-final affine layer and softmax output layer for
+            # each target language.
+            if len(num_multiple_targets) > 1:
+                for target in range(len(num_multiple_targets)):
+                    nodes.AddFinalLayer(config_lines, prev_layer_output,
+                                        num_multiple_targets[target],
+                                        name_affix = 'output-'+str(target),
+                                        use_presoftmax_prior_scale = use_presoftmax_prior_scale,
+                                        prior_scale_file = prior_scale_file,
+                                        include_log_softmax = include_log_softmax,
+                                        add_final_sigmoid = add_final_sigmoid,
+                                        objective_type = objective_type)
+            else:
+                # a final layer is added after each new layer as we are generating
+                # configs for layer-wise discriminative training
+
+                # add_final_sigmoid adds a sigmoid as a final layer as an alternative
+                # to the log-softmax layer.
+                # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers
+                # This is useful when you need the final outputs to be probabilities between 0 and 1.
+                # It is usually used with an objective-type such as "quadratic".
+                # Applications include k-binary classification tasks such as Ideal Ratio Mask prediction.
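+                # (Illustrative example: an Ideal Ratio Mask system would typically be
+                # configured with objective_type="quadratic", add_final_sigmoid=true and
+                # include_log_softmax=false, so that each output dimension is an
+                # independent value in [0,1].)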
+ nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(num_targets), file=f) + print('add_lda=' + ('true' if add_lda else 'false'), file=f) + print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) + print('objective_type=' + objective_type, file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + if args.num_multiple_targets is not None: + num_multiple_targets = args.num_multiple_targets.split() + print('Number of output targets is {0}'.format(len(num_multiple_targets))) + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + add_lda = args.add_lda, + cnn_layer = args.cnn_layer, + cnn_bottleneck_dim = args.cnn_bottleneck_dim, + cepstral_lifter = args.cepstral_lifter, + nonlin_type = args.nonlin_type, + nonlin_input_dim = args.nonlin_input_dim, + nonlin_output_dim = args.nonlin_output_dim, + subset_dim = args.subset_dim, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale_nonlinearity, + objective_type = args.objective_type, + num_multiple_targets = num_multiple_targets, + bottleneck_layer = args.bottleneck_layer, + bottleneck_dim = args.bottleneck_dim) + +if __name__ == "__main__": + Main() + diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py index a43aa05176b..c154e39d7a2 100644 --- a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -24,15 +24,86 @@ def SendMail(message, subject, email_id): logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) pass +def IsMultilingual(egs_dir): + # num of langs used to generate egs is written in egs_dir/info/num_lang + # in multilingual setup. 
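+    # (e.g. if egs_dir/info/num_lang contains "3", the egs were generated from three
+    # languages and this function returns True; if the file is absent we assume a
+    # single language.)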
+    multilingual_training = False
+    num_lang = 1
+    num_lang_str = "{0}/info/num_lang".format(egs_dir)
+    if os.path.isfile(num_lang_str):
+        num_lang = int(open(num_lang_str, 'r').readline())
+        if num_lang > 1:
+            multilingual_training = True
+    return multilingual_training
+
+# Generate the example string passed to nnet3-train and nnet3-compute-prob.
+# It chains nnet3-copy-egs, nnet3-merge-egs and (optionally) nnet3-shuffle-egs,
+# and handles the multilingual case.
+# egs_suffix is empty for the egs used by nnet3-train during training and
+# is "valid_diagnostic" or "train_diagnostic" when running nnet3-compute-prob.
+def ExampleString(egs_dir, minibatch_size,
+                  context_opts = None, archive_index = None,
+                  iter = 1, shuffle_buffer_size = 0,
+                  egs_suffix = None, frame = None):
+    multilingual_training = IsMultilingual(egs_dir)
+
+    frame_opt = ""
+    if frame is not None:
+        frame_opt = "--frame={0}".format(frame)
+
+    # There is no example shuffling when computing diagnostics
+    shuffle_str = ""
+    if shuffle_buffer_size > 0:
+        shuffle_str = " nnet3-shuffle-egs --buffer-size={0} --srand={1} ark:- ark:-|".format(shuffle_buffer_size, iter)
+
+    if multilingual_training:
+        # In the multilingual setup, the examples in egs.{archive_index}.scp are written
+        # in groups of minibatch-size w.r.t. language id.
+        # We first merge examples with the same language id and then shuffle the minibatches.
+        # The output names and per-example weights used for training are read from
+        # output.{archive_index} and weight.{archive_index}.
+        egs_str = ("egs" if egs_suffix is None else egs_suffix+".egs")+("."+str(archive_index)+".scp" if archive_index is not None else ".1.scp")
+
+        multilingual_opts = "--weights='ark:{0}/{2}weight.{1}' --outputs='ark:{0}/{2}output.{1}'".format(egs_dir,
+                            (1 if archive_index is None else archive_index),
+                            (str(egs_suffix)+"." if egs_suffix is not None else ""))
+
+        egs_for_train_string = "ark,bg:nnet3-copy-egs {frame_opt} {context_opts} {multilingual_opts} scp:{egs_dir}/{egs_str} ark:- | nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |{shuffle_str}".format(context_opts = (context_opts if context_opts is not None else ""),
+                            egs_dir = egs_dir,
+                            egs_str = egs_str,
+                            minibatch_size = minibatch_size,
+                            multilingual_opts = multilingual_opts,
+                            frame_opt = frame_opt,
+                            shuffle_str = shuffle_str)
+    else:
+        egs_str = ("egs" if egs_suffix is None else egs_suffix+".egs")+("."+str(archive_index)+".ark" if archive_index is not None else "")
+        egs_for_train_string = "ark,bg:nnet3-copy-egs {frame_opt} {context_opts} ark:{egs_dir}/{egs_str} ark:- |{shuffle_str}\
+        nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false \
+        --discard-partial-minibatches=true ark:- ark:- |\
+        ".format(context_opts = (context_opts if context_opts is not None else ""),
+                 egs_dir = egs_dir,
+                 minibatch_size = minibatch_size,
+                 egs_str = egs_str,
+                 frame_opt = frame_opt,
+                 shuffle_str = shuffle_str)
+
+    return egs_for_train_string
+
+def StrToBool(values):
+    if values == "true":
+        return True
+    elif values == "false":
+        return False
+    else:
+        raise ValueError
 
 class StrToBoolAction(argparse.Action):
     """ A custom action to convert bools from shell format i.e., true/false
         to python format i.e., True/False """
     def __call__(self, parser, namespace, values, option_string=None):
-        if values == "true":
-            setattr(namespace, self.dest, True)
-        elif values == "false":
-            setattr(namespace, self.dest, False)
-        else:
+        try:
+            setattr(namespace, self.dest, StrToBool(values))
+        except ValueError:
             raise Exception("Unknown value {0} for --{1}".format(values, self.dest))
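+# Illustrative usage of StrToBoolAction (the option shown is just an example):
+#   parser.add_argument("--use-gpu", type=str, action=StrToBoolAction,
+#                       choices=["true", "false"], default=True,
+#                       help="Use GPU for training")
+# This lets shell-style "true"/"false" strings set a Python boolean on the parsed namespace.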
class NullstrToNoneAction(argparse.Action): @@ -101,10 +172,68 @@ def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): accepted_models.append(i+1) if len(accepted_models) != num_models: - logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern)) + logger.warn("""Only {0}/{1} of the models have been accepted +for averaging, based on log files {2}.""".format(len(accepted_models), + num_models, log_file_pattern)) return [accepted_models, max_index+1] +def GetAverageNnetModel(dir, iter, nnets_list, run_opts, + get_raw_nnet_from_am = True, shrink = None): + scale = 1.0 + if shrink is not None: + scale = shrink + + new_iter = iter + 1 + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ +{dir}/{iter}.mdl {dir}/{new_iter}.mdl""".format(dir = dir, iter = iter, + new_iter = new_iter, + scale = scale) + else: + if shrink is not None: + out_model = """- \| nnet3-copy --scale={scale} \ +- {dir}/{new_iter}.raw""".format(dir = dir, new_iter = new_iter, scale = scale) + else: + out_model = "{dir}/{new_iter}.raw".format(dir = dir, + new_iter = new_iter) + + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnets_list} \ +{out_model}""".format(command = run_opts.command, + dir = dir, + iter = iter, + nnets_list = nnets_list, + out_model = out_model)) + +def GetBestNnetModel(dir, iter, best_model_index, run_opts, + get_raw_nnet_from_am = True, shrink = None): + scale = 1.0 + if shrink is not None: + scale = shrink + + best_model = '{dir}/{next_iter}.{best_model_index}.raw'.format( + dir = dir, + next_iter = iter + 1, + best_model_index = best_model_index) + + if get_raw_nnet_from_am: + out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ +{dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format(dir = dir, iter = iter, + new_iter = iter + 1) + else: + out_model = '{dir}/{next_iter}.raw'.format(dir = dir, + next_iter = iter + 1) + + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ +nnet3-copy --scale={scale} {best_model} \ +{out_model}""".format(command = run_opts.command, + dir = dir, iter = iter, + best_model = best_model, + out_model = out_model, scale = scale)) + def GetNumberOfLeaves(alidir): [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) parts = stdout.split() @@ -120,6 +249,7 @@ def GetNumberOfJobs(alidir): except IOError, ValueError: raise Exception('Exception while reading the number of alignment jobs') return num_jobs + def GetIvectorDim(ivector_dir = None): if ivector_dir is None: return 0 @@ -132,6 +262,11 @@ def GetFeatDim(feat_dir): feat_dim = int(stdout_val) return feat_dim +def GetFeatDimFromScp(feat_scp): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{feat_scp} -".format(feat_scp = feat_scp)) + feat_dim = int(stdout_val) + return feat_dim + def ReadKaldiMatrix(matrix_file): try: lines = map(lambda x: x.split(), open(matrix_file).readlines()) @@ -205,6 +340,28 @@ def ParseModelConfigVarsFile(var_file): raise Exception('Error while parsing the file {0}'.format(var_file)) +def ParseGenericConfigVarsFile(var_file): + variables = {} + try: + var_file_handle = open(var_file, 'r') + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1].strip() + if field_name in ['model_left_context', 'left_context']: + variables['model_left_context'] = int(field_value) 
+ elif field_name in ['model_right_context', 'right_context']: + variables['model_right_context'] = int(field_value) + elif field_name == 'num_hidden_layers': + variables['num_hidden_layers'] = int(field_value) + else: + variables[field_name] = field_value + return variables + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) def GenerateEgs(data, alidir, egs_dir, left_context, right_context, @@ -242,6 +399,72 @@ def GenerateEgs(data, alidir, egs_dir, egs_dir = egs_dir, egs_opts = egs_opts if egs_opts is not None else '' )) +def GenerateMultilingualEgs(egs_dirs, run_opts, minibatch_size = 512, + samples_per_iter = 40000, + egs_opts = None, stage = 0): + multi_egs_dir = egs_dirs.split() + + RunKaldiCommand(""" +steps/nnet3/multilingual/get_egs.sh {egs_opts} \ + --cmd "{command}" --stage {stage} \ + --minibatch-size {minibatch_size} \ + --samples-per-iter {samples_per_iter} \ + {num_langs} {egs_dirs} + """.format(command = run_opts.command, + samples_per_iter = samples_per_iter, + minibatch_size = minibatch_size, + num_langs = len(multi_egs_dir) - 1, + stage = stage, + egs_opts = egs_opts if egs_opts is not None else '', + egs_dirs = egs_dirs)) + +def GenerateEgsFromTargets(data, targets_scp, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + target_type = 'dense', num_targets = -1, + samples_per_iter = 20000, frames_per_eg = 20, srand = 0, + egs_opts = None, cmvn_opts = None, transform_dir = None): + if target_type == 'dense': + num_targets = GetFeatDimFromScp(targets_scp) + else: + if num_targets == -1: + raise Exception("--num-targets is required if target-type is dense") + + RunKaldiCommand(""" +steps/nnet3/get_egs_targets.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + --srand {srand} \ + --target-type {target_type} \ + --num-targets {num_targets} \ + {data} {targets_scp} {egs_dir} + """.format(command = run_opts.egs_command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, srand = srand, + num_targets = num_targets, + data = data, + targets_scp = targets_scp, target_type = target_type, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): try: egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) @@ -316,7 +539,7 @@ def ForceSymlink(file1, file2): os.symlink(file1, file2) def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, - presoftmax_prior_scale_power = None): + presoftmax_prior_scale_power = -0.25): # getting the raw pdf count 
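+    # (This function accumulates per-pdf counts from the alignments; the scale applied
+    #  to each pdf is roughly (count + smooth * average_count) ** presoftmax_prior_scale_power,
+    #  renormalized so that the scales average to 1 -- see SmoothPresoftmaxPriorScaleVector below.)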
RunKaldiCommand(""" @@ -336,9 +559,14 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, import glob for file in glob.glob('{0}/pdf_counts.*'.format(dir)): os.remove(file) - - smooth=0.01 pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + scaled_counts = SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = presoftmax_prior_scale_power, smooth = 0.01) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + +def SmoothPresoftmaxPriorScaleVector(pdf_counts, presoftmax_prior_scale_power = -0.25, smooth = 0.01): total = sum(pdf_counts) average_count = total/len(pdf_counts) scales = [] @@ -346,20 +574,15 @@ def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + return scaled_counts - output_file = "{0}/presoftmax_prior_scale.vec".format(dir) - WriteKaldiMatrix(output_file, [scaled_counts]) - ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) def PrepareInitialAcousticModel(dir, alidir, run_opts): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model with the transition model.""" - RunKaldiCommand(""" -{command} {dir}/log/add_first_layer.log \ - nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, - dir = dir)) + PrepareInitialNetwork(dir, run_opts) # Convert to .mdl, train the transitions, set the priors. RunKaldiCommand(""" @@ -369,6 +592,12 @@ def PrepareInitialAcousticModel(dir, alidir, run_opts): """.format(command = run_opts.command, dir = dir, alidir = alidir)) +def PrepareInitialNetwork(dir, run_opts): + RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + def VerifyIterations(num_iters, num_epochs, num_hidden_layers, num_archives, max_models_combine, add_layers_period, num_jobs_final): @@ -478,13 +707,17 @@ def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): +def DoShrinkage(iter, model_file, name, non_linearity, shrink_threshold, + get_raw_nnet_from_am = True): if iter == 0: return True try: - output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + if get_raw_nnet_from_am: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) + else: + output, error = RunKaldiCommand("nnet3-info --print-args=false {model_file} | grep '{name}' | grep {non_linearity}".format(name = name, non_linearity = non_linearity, model_file = model_file)) output = output.strip().split("\n") # eg. 
# component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591]
@@ -506,41 +739,66 @@ def DoShrinkage(iter, model_file, non_linearity, shrink_threshold):
     return False
 
-def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256, wait = False):
+def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=256,
+                                wait = False, get_raw_nnet_from_am = True,
+                                compute_accuracy = True):
 
-    model = '{0}/{1}.mdl'.format(dir, iter)
+    if get_raw_nnet_from_am:
+        model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter)
+    else:
+        model = "{dir}/{iter}.raw".format(dir = dir, iter = iter)
+
+    compute_prob_opts = "--compute-accuracy" if compute_accuracy else ""
+
+    valid_egs_for_compute_prob_str = ExampleString(egs_dir, mb_size,
+                                                   context_opts = None,
+                                                   egs_suffix = "valid_diagnostic")
+
+    train_egs_for_compute_prob_str = ExampleString(egs_dir, mb_size,
+                                                   context_opts = None,
+                                                   egs_suffix = "train_diagnostic")
 
     RunKaldiCommand("""
 {command} {dir}/log/compute_prob_valid.{iter}.log \
-  nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \
-        "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/valid_diagnostic.egs ark:- |"
+  nnet3-compute-prob {compute_prob_opts} "{model}" \
+        "{egs_string}"
     """.format(command = run_opts.command,
                dir = dir,
                iter = iter,
               mb_size = mb_size,
               model = model,
-               egs_dir = egs_dir), wait = wait)
+               compute_prob_opts = compute_prob_opts,
+               egs_dir = egs_dir,
+               egs_string = valid_egs_for_compute_prob_str), wait = wait)
 
     RunKaldiCommand("""
 {command} {dir}/log/compute_prob_train.{iter}.log \
-  nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \
-       "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:- |"
+  nnet3-compute-prob {compute_prob_opts} "{model}" \
+       "{egs_string}"
     """.format(command = run_opts.command,
                dir = dir,
                iter = iter,
              mb_size = mb_size,
              model = model,
-               egs_dir = egs_dir), wait = wait)
-
-
-def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False):
+               compute_prob_opts = compute_prob_opts,
+               egs_dir = egs_dir,
+               egs_string = train_egs_for_compute_prob_str), wait = wait)
+
+def ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=256, wait=False,
+                    get_raw_nnet_from_am = True):
+    if get_raw_nnet_from_am:
+        prev_model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter - 1)
+        model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format(dir = dir, iter = iter)
+    else:
+        prev_model = '{0}/{1}.raw'.format(dir, iter - 1)
+        model = '{0}/{1}.raw'.format(dir, iter)
 
-    prev_model = '{0}/{1}.mdl'.format(dir, iter - 1)
-    model = '{0}/{1}.mdl'.format(dir, iter)
     RunKaldiCommand("""
 {command} {dir}/log/progress.{iter}.log \
-nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \
-nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \
+nnet3-info "{model}" '&&' \
+nnet3-show-progress --use-gpu=no "{prev_model}" "{model}" \
 "ark,bg:nnet3-merge-egs --minibatch-size={mb_size} ark:{egs_dir}/train_diagnostic.egs ark:-|"
     """.format(command = run_opts.command,
                dir = dir,
@@ -551,7 +809,8 @@ def ComputeProgress(dir,
iter, egs_dir, run_opts, mb_size=256, wait=False): egs_dir = egs_dir), wait = wait) def CombineModels(dir, num_iters, num_iters_combine, egs_dir, - run_opts, chunk_width = None): + run_opts, chunk_width = None, + get_raw_nnet_from_am = True, compute_accuracy = True): # Now do combination. In the nnet3 setup, the logic # for doing averaging of subsets of the models in the case where # there are too many models to reliably esetimate interpolation @@ -559,10 +818,16 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, raw_model_strings = [] print num_iters_combine for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): - model_file = '{0}/{1}.mdl'.format(dir, iter) - if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) - raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + if get_raw_nnet_from_am: + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + else: + model_file = '{0}/{1}.raw'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append(model_file) if chunk_width is not None: # this is an RNN model @@ -570,26 +835,37 @@ def CombineModels(dir, num_iters, num_iters_combine, egs_dir, else: mbsize = 1024 + if get_raw_nnet_from_am: + out_model = "|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl".format(dir = dir, num_iters = num_iters) + else: + out_model = '{dir}/final.raw'.format(dir = dir) + RunKaldiCommand(""" {command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ -"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" - """.format(command = run_opts.command, + {out_model} + """.format(command = run_opts.command, combine_queue_opt = run_opts.combine_queue_opt, dir = dir, raw_models = " ".join(raw_model_strings), mbsize = mbsize, - num_iters = num_iters, + out_model = out_model, egs_dir = egs_dir)) - # Compute the probability of the final, combined model with - # the same subset we used for the previous compute_probs, as the - # different subsets will lead to different probs. - ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + if get_raw_nnet_from_am: + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + else: + ComputeTrainCvProbabilities(dir, 'final', egs_dir, run_opts, + wait = False, get_raw_nnet_from_am = False, + compute_accuracy = compute_accuracy) def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, - prior_subset_size, run_opts): + prior_subset_size, run_opts, + get_raw_nnet_from_am = True): # Note: this just uses CPUs, using a smallish subset of data. 
""" Computes the average posterior of the network""" import glob @@ -601,15 +877,20 @@ def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, else: egs_part = 'JOB' + if get_raw_nnet_from_am: + model = "nnet3-am-copy --raw=true {dir}/combined.mdl -|".format(dir = dir) + else: + model = "{dir}/final.raw".format(dir = dir) + RunKaldiCommand(""" {command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ - "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ + {model} ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec """.format(command = run_opts.command, - dir = dir, + dir = dir, model = model, num_jobs_compute_prior = run_opts.num_jobs_compute_prior, prior_queue_opt = run_opts.prior_queue_opt, iter = iter, prior_subset_size = prior_subset_size, @@ -643,25 +924,32 @@ def RemoveEgs(egs_dir): def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, preserve_model_interval = 100, - remove_egs = True): + remove_egs = True, + get_raw_nnet_from_am = True): try: if remove_egs: RemoveEgs(egs_dir) for iter in range(num_iters): RemoveModel(nnet_dir, iter, num_iters, 1, - preserve_model_interval) + preserve_model_interval, + get_raw_nnet_from_am = get_raw_nnet_from_am) except (IOError, OSError) as err: logger.warning("Error while cleaning up the nnet directory") raise err def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, - preserve_model_interval = 100): + preserve_model_interval = 100, + get_raw_nnet_from_am = True): if iter % preserve_model_interval == 0: return if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : return - file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if get_raw_nnet_from_am: + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + else: + file_name = '{0}/{1}.raw'.format(nnet_dir, iter) + if os.path.isfile(file_name): os.remove(file_name) diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py index ea8f41749da..f1c489f4ca0 100755 --- a/egs/wsj/s5/steps/nnet3/report/generate_plots.py +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -47,7 +47,7 @@ def GetArgs(): """) parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables") parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) - parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Iteration from which plotting will start") + parser.add_argument("--objective-type", type=str, default="linear", choices=["linear","quadratic","chain"], help="Objective function used during training -- determines which plots are to be plotted."); parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") parser.add_argument("output_dir", help="experiment directory, e.g. 
exp/nnet3/tdnn/report") @@ -422,7 +422,7 @@ def GenerateParameterDiffPlots(exp_dir, output_dir, plot, comparison_dir = None, if latex_report is not None: latex_report.AddFigure(figfile_name, "Parameter differences at {0}".format(component_name)) -def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, objective_type = "linear"): try: os.makedirs(output_dir) except OSError as e: @@ -435,15 +435,18 @@ def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is else: latex_report = None - if is_chain: + if objective_type == "chain": logger.info("Generating log-probability plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) - else: + elif objective_type == "linear": logger.info("Generating accuracy plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) logger.info("Generating log-likelihood plots") GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating " + objective_type + " objective plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'objective', file_basename = 'objective', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) logger.info("Generating non-linearity stats plots") GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) @@ -465,7 +468,7 @@ def Main(): GeneratePlots(args.exp_dir, args.output_dir, comparison_dir = args.comparison_dir, start_iter = args.start_iter, - is_chain = args.is_chain) + objective_type = args.objective_type) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py index bac260e93bc..d79be683cac 100755 --- a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -43,6 +43,8 @@ def GetArgs(): help="alignment directory, from which we derive the num-targets") num_target_group.add_argument("--tree-dir", type=str, help="directory with final.mdl, from which we derive the num-targets") + num_target_group.add_argument("--num-multiple-targets", type=str, + help="space separated number of network targets for different languages(e.g. num-pdf-ids/num-leaves e.g. '1000 2000 3000')") # CNN options parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", @@ -54,10 +56,6 @@ def GetArgs(): help="Output dimension of the linear layer at the CNN output " "for dimension reduction, e.g. 256." "The default zero means this layer is not needed.", default=0) - parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", - help="The factor used for determining the liftering vector in the production of MFCC. " - "User has to ensure that it matches the lifter used in MFCC generation, " - "e.g. 
22.0", default=22.0) # General neural network options parser.add_argument("--splice-indexes", type=str, required = True, @@ -69,6 +67,8 @@ def GetArgs(): help="If \"true\" an LDA matrix computed from the input features " "(spliced according to the first set of splice-indexes) will be used as " "the first Affine layer. This affine layer's parameters are fixed during training. " + "This variable needs to be set to \"false\" when using dense-targets " + "or when --add-idct is set to \"true\"." "If --cnn.layer is specified this option will be forced to \"false\".", default=True, choices = ["false", "true"]) @@ -116,9 +116,26 @@ def GetArgs(): parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, help="if true, a presoftmax-prior-scale is added", choices=['true', 'false'], default = True) + + # Options to convert input MFCC into Fbank features. This is useful when a + # LDA layer is not added (such as when using dense targets) + parser.add_argument("--cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + + parser.add_argument("--add-idct", type=str, action=nnet3_train_lib.StrToBoolAction, + help="Add an IDCT after input to convert MFCC to Fbank", default = False) parser.add_argument("config_dir", help="Directory to write config files and variables") - + # multilingual tdnn with bn layer config + parser.add_argument("--bottleneck-layer", type=int, + help="The layer number to add bottleneck layer," + "if < 0, means this layer is not needed in network.", + default=-1) + parser.add_argument("--bottleneck-dim", type=int, + help="The bottleneck layer dimension in TDNN network e.g. 42.", + default=40) print(' '.join(sys.argv)) args = parser.parse_args() @@ -145,9 +162,13 @@ def CheckArgs(args): if not args.feat_dim > 0: raise Exception("feat-dim has to be postive") + if args.add_lda and args.add_idct: + raise Exception("add-idct can be true only if add-lda is false") + if not args.num_targets > 0: - print(args.num_targets) - raise Exception("num_targets has to be positive") + if args.num_multiple_targets is None: + print(args.num_targets) + raise Exception("num_targets or num_multiple_targets has to be positive") if not args.ivector_dim >= 0: raise Exception("ivector-dim has to be non-negative") @@ -323,7 +344,7 @@ def ParseSpliceString(splice_indexes): # The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. 
def MakeConfigs(config_dir, splice_indexes_string, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, - feat_dim, ivector_dim, num_targets, add_lda, + feat_dim, ivector_dim, num_targets, add_lda, add_idct, nonlin_type, nonlin_input_dim, nonlin_output_dim, subset_dim, nonlin_output_dim_init, nonlin_output_dim_final, use_presoftmax_prior_scale, @@ -333,7 +354,8 @@ def MakeConfigs(config_dir, splice_indexes_string, xent_regularize, xent_separate_forward_affine, self_repair_scale, - objective_type): + objective_type, + num_multiple_targets, bottleneck_layer, bottleneck_dim): parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) @@ -351,8 +373,14 @@ def MakeConfigs(config_dir, splice_indexes_string, config_lines = {'components':[], 'component-nodes':[]} + if add_idct and cnn_layer is None: + # If CNN layer is not None, IDCT will be add inside AddCnnLayers method + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + config_files={} - prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], + ivector_dim, + idct_mat = config_dir.strip() + "/idct.mat" if (add_idct and cnn_layer is None) else None) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) @@ -365,6 +393,9 @@ def MakeConfigs(config_dir, splice_indexes_string, prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes[0], ivector_dim) + # add_lda needs to be set "false" when using dense targets, + # or if the task is not a simple classification task + # (e.g. regression, multi-task) if add_lda: prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') @@ -387,7 +418,15 @@ def MakeConfigs(config_dir, splice_indexes_string, for i in range(0, num_hidden_layers): # make the intermediate config file for layerwise discriminative training - + bnf_suffix="" + if bottleneck_layer > -1 and i+1 == bottleneck_layer: + print('bottleneck layer and its dimension are {0} and {1} respectively.'.format(bottleneck_layer, bottleneck_dim)) + nonlin_output_layer_dim = bottleneck_dim + bnf_suffix = "_Bottleneck" + elif nonlin_type == "relu": + nonlin_output_layer_dim = nonlin_output_dims[i] + elif nonlin_type == "pnorm": + nonlin_output_layer_dim = nonlin_output_dim # prepare the spliced input if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): try: @@ -425,21 +464,21 @@ def MakeConfigs(config_dir, splice_indexes_string, if nonlin_type == "relu" : prev_layer_output_chain = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_output_dim, + prev_layer_output, nonlin_output_layer_dim, self_repair_scale = self_repair_scale, norm_target_rms = final_layer_normalize_target) prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_output_dim, + prev_layer_output, nonlin_output_layer_dim, self_repair_scale = self_repair_scale, norm_target_rms = final_layer_normalize_target) elif nonlin_type == "pnorm" : prev_layer_output_chain = nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_chain", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, + prev_layer_output, nonlin_input_dim, nonlin_output_layer_dim, norm_target_rms = final_layer_normalize_target) prev_layer_output_xent = 
nodes.AddAffPnormLayer(config_lines, "Tdnn_pre_final_xent", - prev_layer_output, nonlin_input_dim, nonlin_output_dim, + prev_layer_output, nonlin_input_dim, nonlin_output_layer_dim, norm_target_rms = final_layer_normalize_target) else: raise Exception("Unknown nonlinearity type") @@ -458,39 +497,59 @@ def MakeConfigs(config_dir, splice_indexes_string, name_affix = 'xent') else: if nonlin_type == "relu": - prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_output_dims[i], + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn{1}_{0}".format(i, bnf_suffix), + prev_layer_output, nonlin_output_layer_dim, self_repair_scale = self_repair_scale, norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) elif nonlin_type == "pnorm": - prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn_{0}".format(i), - prev_layer_output, nonlin_input_dim, nonlin_output_dim, + prev_layer_output = nodes.AddAffPnormLayer(config_lines, "Tdnn{1}_{0}".format(i, bnf_suffix), + prev_layer_output, nonlin_input_dim, nonlin_output_layer_dim, norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) else: raise Exception("Unknown nonlinearity type") - # a final layer is added after each new layer as we are generating - # configs for layer-wise discriminative training - - # add_final_sigmoid adds a sigmoid as a final layer as alternative - # to log-softmax layer. - # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers - # This is useful when you need the final outputs to be probabilities between 0 and 1. - # Usually used with an objective-type such as "quadratic". - # Applications are k-binary classification such Ideal Ratio Mask prediction. - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - use_presoftmax_prior_scale = use_presoftmax_prior_scale, - prior_scale_file = prior_scale_file, - include_log_softmax = include_log_softmax, - add_final_sigmoid = add_final_sigmoid, - objective_type = objective_type) - if xent_regularize != 0.0: - nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, - ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( - 0.5 / xent_regularize), + + if len(num_multiple_targets) > 1: + for target in range(len(num_multiple_targets)): + nodes.AddFinalLayer(config_lines, prev_layer_output, + num_multiple_targets[target], + name_affix = str(target), use_presoftmax_prior_scale = use_presoftmax_prior_scale, prior_scale_file = prior_scale_file, - include_log_softmax = True, - name_affix = 'xent') + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_multiple_targets[target], + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent-output-'+str(target)) + else: + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. 
+ # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} @@ -516,12 +575,15 @@ def MakeConfigs(config_dir, splice_indexes_string, def Main(): args = GetArgs() + if args.num_multiple_targets is not None: + num_multiple_targets = args.num_multiple_targets.split() + print('Number of output targets is {0}'.format(len(num_multiple_targets))) MakeConfigs(config_dir = args.config_dir, splice_indexes_string = args.splice_indexes, feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, num_targets = args.num_targets, - add_lda = args.add_lda, + add_lda = args.add_lda, add_idct = args.add_idct, cnn_layer = args.cnn_layer, cnn_bottleneck_dim = args.cnn_bottleneck_dim, cepstral_lifter = args.cepstral_lifter, @@ -538,7 +600,10 @@ def Main(): xent_regularize = args.xent_regularize, xent_separate_forward_affine = args.xent_separate_forward_affine, self_repair_scale = args.self_repair_scale_nonlinearity, - objective_type = args.objective_type) + objective_type = args.objective_type, + num_multiple_targets = num_multiple_targets, + bottleneck_layer = args.bottleneck_layer, + bottleneck_dim = args.bottleneck_dim) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index a3764b88492..fc137a87d62 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -2,10 +2,11 @@ # Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. -# this script is based on steps/nnet3/lstm/train.sh +# this script is based on steps/nnet3/tdnn/train.sh import subprocess @@ -17,7 +18,8 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -35,170 +37,27 @@ def GetArgs(): Trains a feed forward DNN acoustic model using the cross-entropy objective. DNNs include simple DNNs, TDNNs and CNNs. 
""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") - - # egs extraction options + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') + + train_lib.AddCommonTrainArgs(parser) + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', default = 8, help="Number of output labels per example") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help="Controls randomization of the samples on each" - "iteration. If 0 or a large value the randomization is" - "complete, but this will consume memory and cause spikes" - "in disk I/O. Smaller is easier on disk and memory but" - "less random. It's not a huge deal though, as samples" - "are anyway randomized right at the start." 
- "(the point of this is to get data in different" - "minibatches on different iterations, since in the" - "preconditioning method, 2 samples in the same minibatch" - "can affect each others' gradients.") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers" - "during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="The maximum change in parameters allowed per minibatch," - "measured in Frobenius norm over the entire model") - parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', - default=400000, - help="This is really the number of egs in each archive.") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', - default=-0.25, - help="") - # Realignment parameters - parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', - default=None, action=NullstrToNoneAction, - help="""Command to be used with steps/nnet3/align.sh during realignment""") - parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', - default=30, - help="Number of jobs to use for realignment") - parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', - default=None, action=NullstrToNoneAction, - help="""A space seperated string of realignment - times. Values must be between 0 and 1 - e.g. '0.1 0.2 0.3' """) - - parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', - default=True, action=StrToBoolAction, - choices = ["true", "false"], - help="If true, gpu is used with steps/nnet3/align.sh") - - # Parameters for the optimization parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', default = 512, help="Size of the minibatch used to compute the gradient") - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) - parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', - default = 0.0, - help="""Momentum used in update computation. 
- Note: we implemented it in such a way that - it doesn't increase the effective learning rate.""") - # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + # General options parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, - help="Languade directory") + help="Language directory") parser.add_argument("--ali-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -223,8 +82,9 @@ def ProcessArgs(args): if args.transform_dir is None: args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -248,197 +108,12 @@ def ProcessArgs(args): run_opts.prior_gpu_opt = "--use-gpu=no" run_opts.prior_queue_opt = "" - if args.realign_use_gpu is True: - run_opts.realign_use_gpu = True - run_opts.realign_queue_opt = "--gpu 1" - else: - run_opts.realign_use_gpu = False - run_opts.realign_queue_opt = "" - - if args.realign_command is None: - run_opts.realign_command = args.command - else: - run_opts.realign_command = args.realign_command - run_opts.realign_num_jobs = args.realign_num_jobs - run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - self.realign_use_gpu = None - -# this is the main method which differs between RNN and DNN training -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, minibatch_size, - run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. - # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. 
- frame = (k / num_archives) % frames_per_eg - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - "{raw_model}" \ - "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - frame = frame, - momentum = momentum, max_param_change = max_param_change, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - minibatch_size = minibatch_size), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size, - frames_per_eg, num_hidden_layers, add_layers_period, - left_context, right_context, - momentum, max_param_change, shuffle_buffer_size, - run_opts): - - - - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts) - - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file ) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. 
- raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - - if do_average: - cur_minibatch_size = minibatch_size - cur_max_param_change = max_param_change - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_minibatch_size = minibatch_size // 2 - cur_max_param_change = float(max_param_change) / math.sqrt(2) - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, frames_per_eg, - left_context, right_context, - momentum, max_param_change, - shuffle_buffer_size, cur_minibatch_size, - run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. - RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -461,7 +136,17 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) + + # Set some variables. + + try: + left_context = variables['model_left_context'] + right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -545,15 +230,6 @@ def Train(args, run_opts): num_archives_to_process, args.initial_effective_lrate, args.final_effective_lrate) - realign_iters = [] - if args.realign_times is not None: - realign_iters = GetRealignIters(args.realign_times, - num_iters, - args.num_jobs_initial, - args.num_jobs_final) - print(realign_iters) - # egs_dir will be updated if there is realignment - cur_egs_dir=egs_dir logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) for iter in range(num_iters): @@ -563,29 +239,28 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if iter in realign_iters: - logger.info("Re-aligning the data at iteration {0}".format(iter)) - prev_egs_dir=cur_egs_dir - cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) - new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) - Realign(args.dir, iter, args.feat_dir, args.lang, - prev_egs_dir, cur_egs_dir, - args.prior_subset_size, num_archives, run_opts, - transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) - if args.cleanup and args.egs_dir is None: - RemoveEgs(prev_egs_dir) model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) - TrainOneIteration(args.dir, iter, args.srand, egs_dir, current_num_jobs, - num_archives_processed, num_archives, - learning_rate(iter, current_num_jobs, num_archives_processed), - args.minibatch_size, args.frames_per_eg, - num_hidden_layers, args.add_layers_period, - left_context, right_context, - args.momentum, args.max_param_change, - args.shuffle_buffer_size, run_opts) + train_lib.TrainOneIteration(dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + minibatch_size = args.minibatch_size, + frames_per_eg = args.frames_per_eg, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, @@ -598,7 +273,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -624,7 +299,7 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, cur_egs_dir, + CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) @@ -646,7 +321,7 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died 
due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py new file mode 100755 index 00000000000..f3a11cfcc94 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar +# Apache 2.0. + + +# this script is based on steps/nnet3/tdnn/train_raw_nnet.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +import os.path +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting raw DNN trainer (train_raw_dnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward raw DNN (without transition model) + using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') + + train_lib.AddCommonTrainArgs(parser) + + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + + # General options + parser.add_argument("--nj", type=int, default=4, + help="Number of parallel jobs") + + parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction, + default = True, choices = ["true", "false"], + help="Train neural network using dense targets") + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--targets-scp", type=str, + help="Target for training neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + # set the options corresponding to args.use_gpu + run_opts = train_lib.RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. 
+    GPUs and have nvcc installed, go to src/ and do ./configure; make""")
+
+        run_opts.train_queue_opt = "--gpu 1"
+        run_opts.parallel_train_opts = ""
+        run_opts.combine_queue_opt = "--gpu 1"
+        run_opts.prior_gpu_opt = "--use-gpu=yes"
+        run_opts.prior_queue_opt = "--gpu 1"
+
+    else:
+        logger.warning("""
+    Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""")
+
+        run_opts.train_queue_opt = ""
+        run_opts.parallel_train_opts = "--use-gpu=no"
+        run_opts.combine_queue_opt = ""
+        run_opts.prior_gpu_opt = "--use-gpu=no"
+        run_opts.prior_queue_opt = ""
+
+    run_opts.command = args.command
+    run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command
+    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior
+
+    return [args, run_opts]
+
+# args is a Namespace with the required parameters
+def Train(args, run_opts):
+    arg_string = pprint.pformat(vars(args))
+    logger.info("Arguments for the experiment\n{0}".format(arg_string))
+
+    # Set some variables.
+    feat_dim = GetFeatDim(args.feat_dir)
+    ivector_dim = GetIvectorDim(args.online_ivector_dir)
+
+    # split the training data into parts for individual jobs
+    SplitData(args.feat_dir, args.nj)
+
+    config_dir = '{0}/configs'.format(args.dir)
+    var_file = '{0}/vars'.format(config_dir)
+
+    variables = ParseGenericConfigVarsFile(var_file)
+
+    # Set some variables.
+
+    try:
+        left_context = variables['model_left_context']
+        right_context = variables['model_right_context']
+        num_hidden_layers = variables['num_hidden_layers']
+        if variables['num_targets'] != 'None':
+            num_targets = int(variables['num_targets'])
+        add_lda = StrToBool(variables['add_lda'])
+        include_log_softmax = StrToBool(variables['include_log_softmax'])
+        objective_type = variables['objective_type']
+    except KeyError as e:
+        raise Exception("KeyError {0}: Variables need to be defined in {1}".format(
+            str(e), '{0}/configs'.format(args.dir)))
+
+    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
+    # matrix.  This first config just does any initial splicing that we do;
+    # we do this as it's a convenient way to get the stats for the 'lda-like'
+    # transform.
+
+    if args.use_dense_targets:
+        if GetFeatDimFromScp(args.targets_scp) != num_targets:
+            raise Exception("Mismatch between num-targets provided to "
+                            "script vs configs")
+
+    if (args.stage <= -5):
+        logger.info("Initializing a basic network for estimating preconditioning matrix")
+        RunKaldiCommand("""
+{command} {dir}/log/nnet_init.log \
+    nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw
+    """.format(command = run_opts.command,
+               dir = args.dir))
+
+    default_egs_dir = '{0}/egs'.format(args.dir)
+
+    if args.use_dense_targets:
+        target_type = "dense"
+        compute_accuracy = False
+    else:
+        target_type = "sparse"
+        compute_accuracy = True if objective_type == "linear" else False
+
+    # If more than one egs dir is given in args.egs_dir (space-separated), this
+    # corresponds to multilingual training, and an egs dir should be generated
+    # for each language.  The last dir is the multilingual egs directory, which
+    # is generated by this script, but it requires the per-language egs dirs to
+    # already exist.
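For illustration, a minimal sketch (hypothetical paths, not part of the patch) of how a space-separated --egs.dir value is interpreted under the convention described in the comment above:

    # illustration only: hypothetical paths
    egs_dir_arg = "exp/lang1/egs exp/lang2/egs exp/multi/egs"
    dirs = egs_dir_arg.split()
    per_language_egs_dirs = dirs[:-1]   # these must already exist
    multilingual_egs_dir = dirs[-1]     # generated by this script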
+    multi_egs_dir = args.egs_dir.split() if args.egs_dir is not None else []
+    if len(multi_egs_dir) <= 1:
+        default_egs_dir = '{0}/egs'.format(args.dir)
+        if (args.stage <= -4) and args.egs_dir is None:
+            logger.info("Generating egs")
+
+            GenerateEgsFromTargets(args.feat_dir, args.targets_scp, default_egs_dir,
+                                   left_context, right_context,
+                                   left_context, right_context, run_opts,
+                                   frames_per_eg = args.frames_per_eg,
+                                   egs_opts = args.egs_opts,
+                                   cmvn_opts = args.cmvn_opts,
+                                   online_ivector_dir = args.online_ivector_dir,
+                                   samples_per_iter = args.samples_per_iter,
+                                   transform_dir = args.transform_dir,
+                                   stage = args.egs_stage,
+                                   target_type = target_type,
+                                   num_targets = num_targets)
+
+        if args.egs_dir is None:
+            egs_dir = default_egs_dir
+        else:
+            egs_dir = args.egs_dir
+    else:
+        egs_dir = multi_egs_dir[-1]
+        if (args.stage <= -4):
+            logger.info("Generating multilingual egs dir")
+            GenerateMultilingualEgs(args.egs_dir, run_opts,
+                                    stage = args.egs_stage,
+                                    samples_per_iter = args.samples_per_iter,
+                                    egs_opts = args.egs_opts)
+
+    [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context)
+    assert(args.frames_per_eg == frames_per_eg)
+
+    if (args.num_jobs_final > num_archives):
+        raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory')
+
+    # copy the properties of the egs to dir for
+    # use during decoding
+    CopyEgsPropertiesToExpDir(egs_dir, args.dir)
+
+    if (add_lda and args.stage <= -3):
+        logger.info('Computing the preconditioning matrix for input features')
+
+        ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts,
+                                     max_lda_jobs = args.max_lda_jobs,
+                                     rand_prune = args.rand_prune)
+
+
+    if (args.stage <= -1):
+        logger.info("Preparing the initial network.")
+        PrepareInitialNetwork(args.dir, run_opts)
+
+
+    # set num_iters so that as close as possible, we process the data $num_epochs
+    # times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives,
+    # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
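As a worked example of the relation in the comment above (hypothetical numbers, not taken from any recipe):

    # hypothetical numbers illustrating the iteration count computed below
    num_archives = 120; frames_per_eg = 8
    num_epochs = 2; num_jobs_initial = 2; num_jobs_final = 8
    num_archives_expanded = num_archives * frames_per_eg            # 960
    num_archives_to_process = num_epochs * num_archives_expanded    # 1920
    # avg_num_jobs = (2 + 8) / 2 = 5, so roughly 1920 / 5 iterations:
    num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)  # 384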
+    num_archives_expanded = num_archives * args.frames_per_eg
+    num_archives_to_process = args.num_epochs * num_archives_expanded
+    num_archives_processed = 0
+    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
+
+    num_iters_combine = VerifyIterations(num_iters, args.num_epochs,
+                                         num_hidden_layers, num_archives_expanded,
+                                         args.max_models_combine, args.add_layers_period,
+                                         args.num_jobs_final)
+
+    learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters,
+                                                                                           num_archives_processed,
+                                                                                           num_archives_to_process,
+                                                                                           args.initial_effective_lrate,
+                                                                                           args.final_effective_lrate)
+
+    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
+    for iter in range(num_iters):
+        if (args.exit_stage is not None) and (iter == args.exit_stage):
+            logger.info("Exiting early due to --exit-stage {0}".format(iter))
+            return
+        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+
+        if args.stage <= iter:
+            model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter)
+
+            logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed)))
+
+            train_lib.TrainOneIteration(dir = args.dir,
+                                        iter = iter,
+                                        srand = args.srand,
+                                        egs_dir = egs_dir,
+                                        num_jobs = current_num_jobs,
+                                        num_archives_processed = num_archives_processed,
+                                        num_archives = num_archives,
+                                        learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed),
+                                        minibatch_size = args.minibatch_size,
+                                        frames_per_eg = args.frames_per_eg,
+                                        num_hidden_layers = num_hidden_layers,
+                                        add_layers_period = args.add_layers_period,
+                                        left_context = left_context,
+                                        right_context = right_context,
+                                        momentum = args.momentum,
+                                        max_param_change = args.max_param_change,
+                                        shuffle_buffer_size = args.shuffle_buffer_size,
+                                        run_opts = run_opts,
+                                        compute_accuracy = compute_accuracy,
+                                        get_raw_nnet_from_am = False)
+            if args.cleanup:
+                # clean up everything but the last 2 models, under certain conditions
+                RemoveModel(args.dir, iter-2, num_iters, num_iters_combine,
+                            args.preserve_model_interval, get_raw_nnet_from_am = False)
+
+            if args.email is not None:
+                reporting_iter_interval = num_iters * args.reporting_interval
+                if iter % reporting_iter_interval == 0:
+                    # let's do some reporting
+                    [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir)
+                    message = report
+                    subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter)
+                    SendMail(message, subject, args.email)
+
+        num_archives_processed = num_archives_processed + current_num_jobs
+
+    if args.stage <= num_iters:
+        logger.info("Doing final combination to produce final.mdl")
+        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts,
+                      get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy)
+
+    if include_log_softmax and args.stage <= num_iters + 1:
+        logger.info("Getting average posterior for purpose of using as priors to convert posteriors into likelihoods.")
+        avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir,
+                                                    num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am = False)
+
+    if args.cleanup:
+        logger.info("Cleaning up the experiment directory {0}".format(args.dir))
+        remove_egs = args.remove_egs
+        if args.egs_dir is not None:
+            # this egs_dir was not created by this experiment so we will not
+            # delete it
+            remove_egs = False
+
+        CleanNnetDir(args.dir, num_iters, egs_dir,
+                     preserve_model_interval = args.preserve_model_interval,
+                     remove_egs = remove_egs,
+                     get_raw_nnet_from_am = False)
+
+    # do some reporting
+    [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir)
+    if args.email is not None:
+        SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email)
+
+    report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w")
+    report_handle.write(report)
+    report_handle.close()
+
+    os.system("steps/info/nnet3_dir_info.pl " + args.dir)
+
+def Main():
+    [args, run_opts] = GetArgs()
+    try:
+        Train(args, run_opts)
+    except Exception as e:
+        if args.email is not None:
+            message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir)
+            SendMail(message, message, args.email)
+        traceback.print_exc()
+        raise e
+
+if __name__ == "__main__":
+    Main()
diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
new file mode 100755
index 00000000000..5842e63474e
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Vijayaditya Peddinti.
+#           2016 Vimal Manohar
+# Apache 2.0.
+
+# this script is based on steps/nnet3/lstm/train.sh
+
+import subprocess
+import argparse
+import sys
+import pprint
+import logging
+import imp
+import traceback
+import os.path
+from nnet3_train_lib import *
+
+nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py')
+rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py')
+train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py')
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s')
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+logger.info('Starting RNN trainer (train_raw_rnn.py)')
+
+
+def GetArgs():
+    # we add compulsory arguments as named arguments for readability
+    parser = argparse.ArgumentParser(description="""
+    Trains a raw RNN (without a transition model) using the cross-entropy
+    objective.  RNNs include LSTMs, BLSTMs and GRUs.
+    RNN acoustic model training differs from feed-forward DNN training
+    in the following ways:
+        1. RNN acoustic models train on output chunks rather than individual
+           outputs
+        2. The training includes an additional stage of shrinkage, where
+           the parameters of the model are scaled when the derivative averages
+           at the non-linearities are below a threshold.
+        3. RNNs can also be trained with state preservation training
+    """,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        conflict_handler = 'resolve')
+
+    train_lib.AddCommonTrainArgs(parser)
+
+    # egs extraction options
+    parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width',
+                        default = 20,
+                        help="""Number of output labels in the sequence
+                        used to train an LSTM.
+ Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.cv-minibatch-size", type=int, dest='cv_minibatch_size', + default = 256, + help="Size of the minibatch to be used in diagnostic jobs (use smaller value for BLSTMs to control memory usage)") + + + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." 
)
+
+    # General options
+    parser.add_argument("--nj", type=int, default=4,
+                        help="Number of parallel jobs")
+
+    parser.add_argument("--use-dense-targets", type=str, action=StrToBoolAction,
+                        default = True, choices = ["true", "false"],
+                        help="Train neural network using dense targets")
+    parser.add_argument("--feat-dir", type=str, required = True,
+                        help="Directory with features used for training the neural network.")
+    parser.add_argument("--targets-scp", type=str, required = True,
+                        help="Targets for training the neural network.")
+    parser.add_argument("--dir", type=str, required = True,
+                        help="Directory to store the models and all other files.")
+
+    print(' '.join(sys.argv))
+
+    args = parser.parse_args()
+
+    [args, run_opts] = ProcessArgs(args)
+
+    return [args, run_opts]
+
+def ProcessArgs(args):
+    # process the options
+    if args.chunk_width < 1:
+        raise Exception("--egs.chunk-width should have a minimum value of 1")
+
+    if args.chunk_left_context < 0:
+        raise Exception("--egs.chunk-left-context should be non-negative")
+
+    if args.chunk_right_context < 0:
+        raise Exception("--egs.chunk-right-context should be non-negative")
+
+    if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")):
+        raise Exception("""This script expects {0} to exist and have a configs
+                        directory which is the output of the make_configs.py script""".format(args.dir))
+
+    # set the options corresponding to args.use_gpu
+    run_opts = train_lib.RunOpts()
+    if args.use_gpu:
+        if not CheckIfCudaCompiled():
+            logger.warning("""
+    You are running with one thread but you have not compiled
+    for CUDA. You may be running a setup optimized for GPUs. If you have
+    GPUs and have nvcc installed, go to src/ and do ./configure; make""")
+
+        run_opts.train_queue_opt = "--gpu 1"
+        run_opts.parallel_train_opts = ""
+        run_opts.combine_queue_opt = "--gpu 1"
+        run_opts.prior_gpu_opt = "--use-gpu=yes"
+        run_opts.prior_queue_opt = "--gpu 1"
+
+    else:
+        logger.warning("""
+    Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""")
+
+        run_opts.train_queue_opt = ""
+        run_opts.parallel_train_opts = "--use-gpu=no"
+        run_opts.combine_queue_opt = ""
+        run_opts.prior_gpu_opt = "--use-gpu=no"
+        run_opts.prior_queue_opt = ""
+
+    run_opts.command = args.command
+    run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command
+    run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior
+
+    return [args, run_opts]
+
+# args is a Namespace with the required parameters
+def Train(args, run_opts):
+    arg_string = pprint.pformat(vars(args))
+    logger.info("Arguments for the experiment\n{0}".format(arg_string))
+
+    # Set some variables.
+    feat_dim = GetFeatDim(args.feat_dir)
+    ivector_dim = GetIvectorDim(args.online_ivector_dir)
+
+    # split the training data into parts for individual jobs
+    SplitData(args.feat_dir, args.nj)
+
+    config_dir = '{0}/configs'.format(args.dir)
+    var_file = '{0}/vars'.format(config_dir)
+
+    variables = ParseGenericConfigVarsFile(var_file)
+
+    # Set some variables.
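For reference, the vars file read below is assumed to hold simple key=value lines written by make_configs.py; a plausible example (values are hypothetical) that would satisfy the lookups in the try block that follows:

    model_left_context=14
    model_right_context=10
    num_hidden_layers=3
    num_targets=3000
    add_lda=true
    include_log_softmax=true
    objective_type=linear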
+ + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + num_targets = int(variables['num_targets']) + add_lda = StrToBool(variables['add_lda']) + include_log_softmax = StrToBool(variables['include_log_softmax']) + objective_type = variables['objective_type'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if args.use_dense_targets: + if GetFeatDimFromScp(args.targets_scp) != num_targets: + raise Exception("Mismatch between num-targets provided to " + "script vs configs") + + if (args.stage <= -4): + logger.info("Initializing a basic network") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + + if args.use_dense_targets: + target_type = "dense" + compute_accuracy = False + else: + target_type = "sparse" + compute_accuracy = True if objective_type == "linear" else False + + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgsUsingTargets(args.feat_dir, args.targets_scp, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + srand = args.srand, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage, + target_type = target_type, + num_targets = num_targets) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (add_lda and args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialNetwork(args.dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
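Unlike the frame-level DNN case above, the chunk-based computation below does not expand the archive count by frames-per-eg, since each eg is trained as a whole chunk; a worked example with hypothetical numbers:

    # hypothetical numbers for the chunk-based (RNN) case
    num_archives = 90; num_epochs = 4
    num_jobs_initial = 1; num_jobs_final = 8
    num_archives_to_process = num_epochs * num_archives                               # 360
    num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)  # 80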
+    num_archives_to_process = args.num_epochs * num_archives
+    num_archives_processed = 0
+    num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final)
+
+    num_iters_combine = VerifyIterations(num_iters, args.num_epochs,
+                                         num_hidden_layers, num_archives,
+                                         args.max_models_combine, args.add_layers_period,
+                                         args.num_jobs_final)
+
+    learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters,
+                                                                                           num_archives_processed,
+                                                                                           num_archives_to_process,
+                                                                                           args.initial_effective_lrate,
+                                                                                           args.final_effective_lrate)
+    if args.num_bptt_steps is None:
+        num_bptt_steps = args.chunk_width
+    else:
+        num_bptt_steps = args.num_bptt_steps
+
+    min_deriv_time = args.chunk_width - num_bptt_steps
+
+
+    logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters))
+    for iter in range(num_iters):
+        if (args.exit_stage is not None) and (iter == args.exit_stage):
+            logger.info("Exiting early due to --exit-stage {0}".format(iter))
+            return
+        current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters)
+
+        if args.stage <= iter:
+            model_file = "{dir}/{iter}.raw".format(dir = args.dir, iter = iter)
+            shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "Lstm*", "SigmoidComponent", args.shrink_threshold, get_raw_nnet_from_am = False) else 1
+            logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value))
+
+            rnn_train_lib.TrainOneIteration(
+                dir = args.dir,
+                iter = iter,
+                srand = args.srand,
+                egs_dir = egs_dir,
+                num_jobs = current_num_jobs,
+                num_archives_processed = num_archives_processed,
+                num_archives = num_archives,
+                learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed),
+                shrinkage_value = shrinkage_value,
+                num_chunk_per_minibatch = args.num_chunk_per_minibatch,
+                num_hidden_layers = num_hidden_layers,
+                add_layers_period = args.add_layers_period,
+                left_context = left_context,
+                right_context = right_context,
+                min_deriv_time = min_deriv_time,
+                momentum = args.momentum,
+                max_param_change = args.max_param_change,
+                shuffle_buffer_size = args.shuffle_buffer_size,
+                cv_minibatch_size = args.cv_minibatch_size,
+                run_opts = run_opts,
+                compute_accuracy = compute_accuracy,
+                get_raw_nnet_from_am = False)
+
+            if args.cleanup:
+                # clean up everything but the last 2 models, under certain conditions
+                RemoveModel(args.dir, iter-2, num_iters, num_iters_combine,
+                            args.preserve_model_interval, get_raw_nnet_from_am = False)
+
+            if args.email is not None:
+                reporting_iter_interval = num_iters * args.reporting_interval
+                if iter % reporting_iter_interval == 0:
+                    # let's do some reporting
+                    [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir)
+                    message = report
+                    subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter)
+                    SendMail(message, subject, args.email)
+
+        num_archives_processed = num_archives_processed + current_num_jobs
+
+    if args.stage <= num_iters:
+        logger.info("Doing final combination to produce final.raw")
+        CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts,
+                      chunk_width = args.chunk_width, get_raw_nnet_from_am = False, compute_accuracy = compute_accuracy)
+
+    if include_log_softmax and args.stage <= num_iters + 1:
+        logger.info("Getting average posterior for purpose of using as priors to convert
posteriors into likelihoods.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'final', egs_dir, + num_archives, args.prior_subset_size, run_opts, get_raw_nnet_from_am = False) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs, + get_raw_nnet_from_am = False) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + + os.system("steps/info/nnet3_dir_info.pl " + args.dir) + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + SendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 3763fb26303..a5679800db6 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -2,6 +2,7 @@ # Copyright 2016 Vijayaditya Peddinti. +# 2016 Vimal Manohar # Apache 2.0. @@ -17,7 +18,9 @@ import traceback from nnet3_train_lib import * -nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') +rnn_train_lib = imp.load_source('rtl', 'steps/nnet3/libs/rnn_train_lib.py') +train_lib = imp.load_source('tl', 'steps/nnet3/libs/train_lib.py') logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -43,16 +46,10 @@ def GetArgs(): at the non-linearities are below a threshold. 3. RNNs can also be trained with state preservation training """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + conflict_handler = 'resolve') - # feat options - parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', - default = None, action = NullstrToNoneAction, - help="""directory with the ivectors extracted in - an online fashion.""") - parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', - default = None, action = NullstrToNoneAction, - help="A string specifying '--norm-means' and '--norm-vars' values") + train_lib.AddCommonTrainArgs(parser) # egs extraction options parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', @@ -69,58 +66,6 @@ def GetArgs(): default = 0, help="""Number of right steps used in the estimation of BLSTM state before prediction of the first label""") - parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default = None, action = NullstrToNoneAction, - help="""Directory with egs. 
If specified this directory - will be used rather than extracting egs""") - parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default = 0, help="Stage at which get_egs.sh should be restarted") - parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default = None, action = NullstrToNoneAction, - help="""String to provide options directly to steps/nnet3/get_egs.sh script""") - - # trainer options - parser.add_argument("--trainer.srand", type=int, dest='srand', - default = 0, - help="Sets the random seed for model initialization and egs shuffling. " - "Warning: This random seed does not control all aspects of this experiment. " - "There might be other random seeds used in other stages of the experiment " - "like data preparation (e.g. volume perturbation).") - parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', - default = 8, - help="Number of epochs to train the model") - parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', - default = 20000, - help="Number of samples for computing priors") - parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', - default = 10, - help="The prior computation jobs are single threaded and run on the CPU") - parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") - parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', - default = 5000, - help=""" Controls randomization of the samples on each - iteration. If 0 or a large value the randomization is - complete, but this will consume memory and cause spikes - in disk I/O. Smaller is easier on disk and memory but - less random. It's not a huge deal though, as samples - are anyway randomized right at the start. 
- (the point of this is to get data in different - minibatches on different iterations, since in the - preconditioning method, 2 samples in the same minibatch - can affect each others' gradients.""") - parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', - default=2, - help="The number of iterations between adding layers during layer-wise discriminative training.") - parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', - default=2.0, - help="""The maximum change in parameters allowed - per minibatch, measured in Frobenius norm over - the entire model""") parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -128,49 +73,8 @@ def GetArgs(): for chunk_width=20, this value (20k) is equivalent to the 400k number that we use as a default in regular DNN training.""") - parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', - default=4.0, - help="""Value used in preconditioning matrix estimation""") - parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', - default=10, - help="""Max number of jobs used for LDA stats accumulation""") - - # Realignment parameters - parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', - default=None, action=NullstrToNoneAction, - help="""Command to be used with steps/nnet3/align.sh during realignment""") - parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', - default=30, - help="Number of jobs to use for realignment") - parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', - default=None, action=NullstrToNoneAction, - help="""A space seperated string of realignment - times. Values must be between 0 and 1 - e.g. '0.1 0.2 0.3' """) - - parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', - default=True, action=StrToBoolAction, - choices = ["true", "false"], - help="If true, gpu is used with steps/nnet3/align.sh") # Parameters for the optimization - parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', - default = 0.0003, - help="Learning rate used during the initial iteration") - parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', - default = 0.00003, - help="Learning rate used during the final iteration") - parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', - default = 1, - help="Number of neural net jobs to run in parallel at the start of training") - parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', - default = 8, - help="Number of neural net jobs to run in parallel at the end of training") - parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', - default = 20, - help = """ The is the maximum number of models we give to the - final 'combine' stage, but these models will themselves - be averages of iteration-number ranges. """) parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default = 0.5, help="""Momentum used in update computation. @@ -197,45 +101,10 @@ def GetArgs(): help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." 
) # General options - parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment to execution from") - parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before running this stage") - parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, - dest = "command", - help="""Specifies the script to launch jobs. - e.g. queue.pl for launching on SGE cluster - run.pl for launching on local machine - """, default = "queue.pl") - parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Use GPU for training", default=True) - parser.add_argument("--cleanup", type=str, action = StrToBoolAction, - choices = ["true", "false"], - help="Clean up models after training", default=True) - parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', - default = True, action = StrToBoolAction, - choices = ["true", "false"], - help="""If true, remove egs after experiment""") - parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", - type=int, default=100, - help="Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 model will be preserved.") - - parser.add_argument("--reporting.email", dest = "email", - type=str, default=None, action = NullstrToNoneAction, - help=""" Email-id to report about the progress of the experiment. - NOTE: It assumes the machine on which the script is being run can send - emails from command line via. mail program. The - Kaldi mailing list will not support this feature. - It might require local expertise to setup. """) - parser.add_argument("--reporting.interval", dest = "reporting_interval", - type=int, default=0.1, - help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") - parser.add_argument("--feat-dir", type=str, required = True, help="Directory with features used for training the neural network.") parser.add_argument("--lang", type=str, required = True, - help="Languade directory") + help="Language directory") parser.add_argument("--ali-dir", type=str, required = True, help="Directory with alignments used for training the neural network.") parser.add_argument("--dir", type=str, required = True, @@ -266,8 +135,9 @@ def ProcessArgs(args): if args.transform_dir is None: args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu - run_opts = RunOpts() + run_opts = train_lib.RunOpts() if args.use_gpu: if not CheckIfCudaCompiled(): logger.warning(""" @@ -291,231 +161,12 @@ def ProcessArgs(args): run_opts.prior_gpu_opt = "--use-gpu=no" run_opts.prior_queue_opt = "" - if args.realign_use_gpu is True: - run_opts.realign_use_gpu = True - run_opts.realign_queue_opt = "--gpu 1" - else: - run_opts.realign_use_gpu = False - run_opts.realign_queue_opt = "" - - if args.realign_command is None: - run_opts.realign_command = args.command - else: - run_opts.realign_command = args.realign_command - run_opts.realign_num_jobs = args.realign_num_jobs - run_opts.command = args.command + run_opts.egs_command = args.egs_command if args.egs_command is not None else args.command run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior return [args, run_opts] -class StrToBoolAction(argparse.Action): - """ A custom action to convert bools from shell format i.e., true/false - to python format i.e., True/False """ - def __call__(self, parser, namespace, values, option_string=None): - if values == "true": - setattr(namespace, self.dest, True) - elif values == "false": - setattr(namespace, self.dest, False) - else: - raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) - -class NullstrToNoneAction(argparse.Action): - """ A custom action to convert empty strings passed by shell - to None in python. This is necessary as shell scripts print null strings - when a variable is not specified. We could use the more apt None - in python. """ - def __call__(self, parser, namespace, values, option_string=None): - if values.strip() == "": - setattr(namespace, self.dest, None) - else: - setattr(namespace, self.dest, values) - - -# a class to store run options -class RunOpts: - def __init__(self): - self.command = None - self.train_queue_opt = None - self.combine_queue_opt = None - self.prior_gpu_opt = None - self.prior_queue_opt = None - self.parallel_train_opts = None - self.realign_use_gpu = None - - -def TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch, - cache_read_opt, run_opts): - # We cannot easily use a single parallel SGE job to do the main training, - # because the computation of which archive and which --frame option - # to use for each job is a little complex, so we spawn each one separately. 
- # this is no longer true for RNNs as we use do not use the --frame option - # but we use the same script for consistency with FF-DNN code - - context_opts="--left-context={0} --right-context={1}".format( - left_context, right_context) - processes = [] - for job in range(1,num_jobs+1): - k = num_archives_processed + job - 1 # k is a zero-based index that we will derive - # the other indexes from. - archive_index = (k % num_archives) + 1 # work out the 1-based archive index. - - cache_write_opt = "" - if job == 1: - # an option for writing cache (storing pairs of nnet-computations and - # computation-requests) during training. - cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) - - process_handle = RunKaldiCommand(""" -{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ - nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ - --print-interval=10 --momentum={momentum} \ - --max-param-change={max_param_change} \ - --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ - "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={srand} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw - """.format(command = run_opts.command, - train_queue_opt = run_opts.train_queue_opt, - dir = dir, iter = iter, srand = iter + srand, next_iter = iter + 1, job = job, - parallel_train_opts = run_opts.parallel_train_opts, - cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, - momentum = momentum, max_param_change = max_param_change, - min_deriv_time = min_deriv_time, - raw_model = raw_model_string, context_opts = context_opts, - egs_dir = egs_dir, archive_index = archive_index, - shuffle_buffer_size = shuffle_buffer_size, - num_chunk_per_minibatch = num_chunk_per_minibatch), - wait = False) - - processes.append(process_handle) - - all_success = True - for process in processes: - process.wait() - [stdout_value, stderr_value] = process.communicate() - print(stderr_value) - if process.returncode != 0: - all_success = False - - if not all_success: - open('{0}/.error'.format(dir), 'w').close() - raise Exception("There was error during training iteration {0}".format(iter)) - -def TrainOneIteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, num_chunk_per_minibatch, - num_hidden_layers, add_layers_period, - left_context, right_context, min_deriv_time, - momentum, max_param_change, shuffle_buffer_size, - cv_minibatch_size, run_opts): - # Set off jobs doing some diagnostics, in the background. - # Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) - - # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): - try: - saved_srand = int(open('{0}/srand'.format(dir), 'r').readline().strip()) - except IOError, ValueError: - raise Exception('Exception while reading the random seed for training') - if srand != saved_srand: - logger.warning("The random seed provided to this iteration (srand={0}) is different from the one saved last time (srand={1}). 
Using srand={0}.".format(srand, saved_srand)) - else: - f = open('{0}/srand'.format(dir), 'w') - f.write(str(srand)) - f.close() - - - ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) - - if iter > 0: - ComputeProgress(dir, iter, egs_dir, run_opts, mb_size=cv_minibatch_size) - - # an option for writing cache (storing pairs of nnet-computations - # and computation-requests) during training. - cache_read_opt = "" - if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): - do_average = False # if we've just mixed up, don't do averaging but take the - # best. - cur_num_hidden_layers = 1 + iter / add_layers_period - config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={srand} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, srand=iter + srand, config=config_file) - else: - do_average = True - if iter == 0: - do_average = False # on iteration 0, pick the best, don't average. - else: - cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) - raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) - - if do_average: - cur_num_chunk_per_minibatch = num_chunk_per_minibatch - else: - # on iteration zero or when we just added a layer, use a smaller minibatch - # size (and we will later choose the output of just one of the jobs): the - # model-averaging isn't always helpful when the model is changing too fast - # (i.e. it can worsen the objective function), and the smaller minibatch - # size will help to keep the update stable. - cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 - - try: - os.remove("{0}/.error".format(dir)) - except OSError: - pass - - TrainNewModels(dir, iter, srand, num_jobs, num_archives_processed, num_archives, - raw_model_string, egs_dir, - left_context, right_context, min_deriv_time, - momentum, max_param_change, - shuffle_buffer_size, cur_num_chunk_per_minibatch, - cache_read_opt, run_opts) - [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) - nnets_list = [] - for n in models_to_average: - nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) - - if do_average: - # average the output of the different jobs. 
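The branch above decides three related things per iteration: whether a new hidden layer is spliced in, whether to average the parallel jobs' models or keep only the best one, and whether to halve the minibatch size to keep the update stable right after a change. A hedged Python sketch of just that decision logic (simplified; the real script also rebuilds the raw-model string in the layer-adding case):

def iteration_plan(iter, num_hidden_layers, add_layers_period,
                   num_chunk_per_minibatch):
    # A new layer is added every add_layers_period iterations until all
    # num_hidden_layers layers are in place.
    add_layer = (iter > 0
                 and iter <= (num_hidden_layers - 1) * add_layers_period
                 and iter % add_layers_period == 0)
    # On iteration 0 and right after adding a layer, pick the single best
    # job's model instead of averaging, and use a smaller minibatch.
    do_average = not (iter == 0 or add_layer)
    minibatch = (num_chunk_per_minibatch if do_average
                 else num_chunk_per_minibatch // 2)
    return add_layer, do_average, minibatch

# With 5 hidden layers added every 2 iterations:
for it in range(7):
    print(it, iteration_plan(it, 5, 2, 128))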
- RunKaldiCommand(""" -{command} {dir}/log/average.{iter}.log \ -nnet3-average {nnet_list} - \| \ -nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl - """.format(command = run_opts.command, - dir = dir, - iter = iter, - nnet_list = " ".join(nnets_list), - shrink = shrinkage_value, - new_iter = iter + 1)) - - else: - # choose the best model from different jobs - RunKaldiCommand(""" -{command} {dir}/log/select.{iter}.log \ - nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl - """.format(command = run_opts.command, - dir = dir, iter = iter, next_iter = iter + 1, - shrink = shrinkage_value, best_model_index = best_model)) - - try: - for i in range(1, num_jobs + 1): - os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) - except OSError: - raise Exception("Error while trying to delete the raw models") - - new_model = "{0}/{1}.mdl".format(dir, iter + 1) - - if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) - elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) - if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): - os.remove("{0}/cache.{1}".format(dir, iter)) - - # args is a Namespace with the required parameters def Train(args, run_opts): arg_string = pprint.pformat(vars(args)) @@ -538,7 +189,21 @@ def Train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + variables = ParseGenericConfigVarsFile(var_file) + + # Set some variables. + + try: + model_left_context = variables['model_left_context'] + model_right_context = variables['model_right_context'] + num_hidden_layers = variables['num_hidden_layers'] + except KeyError as e: + raise Exception("KeyError {0}: Variables need to be defined in {1}".format( + str(e), '{0}/configs'.format(args.dir))) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. 
This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' @@ -552,9 +217,6 @@ def Train(args, run_opts): """.format(command = run_opts.command, dir = args.dir)) - left_context = args.chunk_left_context + model_left_context - right_context = args.chunk_right_context + model_right_context - default_egs_dir = '{0}/egs'.format(args.dir) if (args.stage <= -3) and args.egs_dir is None: logger.info("Generating egs") @@ -616,15 +278,6 @@ def Train(args, run_opts): num_archives_to_process, args.initial_effective_lrate, args.final_effective_lrate) - realign_iters = [] - if args.realign_times is not None: - realign_iters = GetRealignIters(args.realign_times, - num_iters, - args.num_jobs_initial, - args.num_jobs_final) - print(realign_iters) - # egs_dir will be updated if there is realignment - cur_egs_dir=egs_dir if args.num_bptt_steps is None: num_bptt_steps = args.chunk_width @@ -642,41 +295,31 @@ def Train(args, run_opts): current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) if args.stage <= iter: - if iter in realign_iters: - logger.info("Re-aligning the data at iteration {0}".format(iter)) - prev_egs_dir=cur_egs_dir - cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) - new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) - Realign(args.dir, iter, args.feat_dir, args.lang, - prev_egs_dir, cur_egs_dir, - args.prior_subset_size, num_archives, run_opts, - transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) - if args.cleanup and args.egs_dir is None: - RemoveEgs(prev_egs_dir) model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) - TrainOneIteration(dir = args.dir, - iter = iter, - srand = args.srand, - egs_dir = egs_dir, - num_jobs = current_num_jobs, - num_archives_processed = num_archives_processed, - num_archives = num_archives, - learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), - shrinkage_value = shrinkage_value, - num_chunk_per_minibatch = args.num_chunk_per_minibatch, - num_hidden_layers = num_hidden_layers, - add_layers_period = args.add_layers_period, - left_context = left_context, - right_context = right_context, - min_deriv_time = min_deriv_time, - momentum = args.momentum, - max_param_change= args.max_param_change, - shuffle_buffer_size = args.shuffle_buffer_size, - cv_minibatch_size = args.cv_minibatch_size, - run_opts = run_opts) + rnn_train_lib.TrainOneIteration( + dir = args.dir, + iter = iter, + srand = args.srand, + egs_dir = egs_dir, + num_jobs = current_num_jobs, + num_archives_processed = num_archives_processed, + num_archives = num_archives, + learning_rate = learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value = shrinkage_value, + num_chunk_per_minibatch = args.num_chunk_per_minibatch, + num_hidden_layers = num_hidden_layers, + add_layers_period = args.add_layers_period, + left_context = left_context, + right_context = right_context, + min_deriv_time = min_deriv_time, + momentum = args.momentum, + max_param_change = args.max_param_change, + shuffle_buffer_size = args.shuffle_buffer_size, + cv_minibatch_size = 
args.cv_minibatch_size, + run_opts = run_opts) if args.cleanup: # do a clean up everythin but the last 2 models, under certain conditions @@ -690,7 +333,7 @@ def Train(args, run_opts): [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) message = report subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) - sendMail(message, subject, args.email) + SendMail(message, subject, args.email) num_archives_processed = num_archives_processed + current_num_jobs @@ -717,21 +360,20 @@ def Train(args, run_opts): # delete it remove_egs = False - CleanNnetDir(args.dir, num_iters, cur_egs_dir, + CleanNnetDir(args.dir, num_iters, egs_dir, preserve_model_interval = args.preserve_model_interval, remove_egs = remove_egs) # do some reporting [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) if args.email is not None: - sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") report_handle.write(report) report_handle.close() - os.system("steps/info/nnet3_dir_info.sh " + args.dir) - + os.system("steps/info/nnet3_dir_info.pl " + args.dir) def Main(): [args, run_opts] = GetArgs() @@ -740,19 +382,9 @@ def Main(): except Exception as e: if args.email is not None: message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) - sendMail(message, message, args.email) + SendMail(message, message, args.email) traceback.print_exc() raise e -def SendMail(message, subject, email_id): - try: - subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( - message = message, - subject = subject, - email = email_id), shell=True) - except Exception as e: - logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) - pass - if __name__ == "__main__": Main() diff --git a/src/base/kaldi-math-test.cc b/src/base/kaldi-math-test.cc index 52719cc4669..44ebfee01e0 100644 --- a/src/base/kaldi-math-test.cc +++ b/src/base/kaldi-math-test.cc @@ -128,6 +128,41 @@ void UnitTestRand() { KALDI_ASSERT(tot > (n * p * 0.8) && tot < (n * p * 1.2)); } } + { // test-1 RandIntDiscreteDist(). + int32 n = 10000, m = 10; + std::vector p(m, 0.0); + BaseFloat sum = 0.0; + // generate discrete probability distribution + for (int32 i = 0; i < m; i++) { + p[i] = RandUniform(); + if (RandInt(0,5) == 0) p[i] = 0; + sum += p[i]; + } + for (int32 i = 0; i < m; i++) + p[i] /= sum; + + std::vector rand_seq(n,0); + std::vector empirical_dist(m,0); + for (int32 i = 0; i < n; i++) { + rand_seq[i] = RandIntDiscreteDist(p); + // compute empirical distribution of generated sequence. + empirical_dist[rand_seq[i]] += 1.0/n; + } + + BaseFloat tmp = 0.0, kl_div = 0.0; + for (int32 i = 0; i < m; i++) { + if (p[i] < 0.0000001) { + KALDI_ASSERT(empirical_dist[i] <= 0.001); + KALDI_LOG << " p and q for i = " << i << " is " << p[i] << ", " << empirical_dist[i]; + } else { + if (empirical_dist[i] > 0.0) { + tmp = p[i]/empirical_dist[i]; + kl_div += p[i] * log(p[i]/empirical_dist[i]); + } + } + } + KALDI_ASSERT(kl_div < 0.001); + } { // test RandInt(). 
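The new unit test above draws 10000 samples from RandIntDiscreteDist, accumulates the empirical distribution, and asserts that its KL divergence from the target distribution stays below 0.001. A rough Python equivalent of both the cumulative-distribution sampler and that sanity check (the names sample_discrete and kl_check are made up for this sketch):

import bisect
import itertools
import math
import random

def sample_discrete(p, rng=random):
    # Build the cumulative distribution and locate a uniform draw in it,
    # as RandIntDiscreteDist does with std::lower_bound on the cdf.
    cdf = list(itertools.accumulate(p))
    r = min(rng.random(), 1.0)
    # Guard against r falling just above cdf[-1] due to rounding.
    return min(bisect.bisect_left(cdf, r), len(p) - 1)

def kl_check(p, n=10000, tol=0.001):
    counts = [0.0] * len(p)
    for _ in range(n):
        counts[sample_discrete(p)] += 1.0 / n
    kl = 0.0
    for pi, qi in zip(p, counts):
        if pi < 1e-7:
            assert qi <= 0.001          # near-zero bins are (almost) never drawn
        elif qi > 0.0:
            kl += pi * math.log(pi / qi)
    assert kl < tol, "empirical distribution is too far from the target"
    return kl

print(kl_check([0.1, 0.2, 0.3, 0.4]))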
KALDI_ASSERT(RandInt(0, 3) >= 0 && RandInt(0, 3) <= 3); diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index 40300331731..c095d12e42e 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -24,6 +24,7 @@ #include #endif #include +#include namespace kaldi { // These routines are tested in matrix/matrix-test.cc @@ -74,6 +75,26 @@ RandomState::RandomState() { seed = Rand() + 27437; } +int32 RandIntDiscreteDist(const std::vector &prob, struct RandomState* state) { + BaseFloat prob_sum = std::accumulate(prob.begin(), prob.end(), 0.0); + KALDI_ASSERT(prob_sum <= 1.1 && prob_sum >= 0.99); // probability distribution sum should be one. + std::vector scaled_prob(prob); + int32 prob_size = prob.size(); + for (int32 i = 0; i < prob_size; i++) + scaled_prob[i] *= 1.0 / prob_sum; + std::vector cdf(prob_size); // cumulative probability distribution. + cdf[0] = scaled_prob[0]; + // if cdf(i) < random number < cdf(i+1), it returns i. + for (int32 i = 1; i < prob_size; i++) + cdf[i] = cdf[i-1] + scaled_prob[i]; + BaseFloat rand_num = RandUniform(state); + if (rand_num > 1.0) rand_num = 1.0; + std::vector::iterator low = std::lower_bound(cdf.begin(), cdf.end(), rand_num); + int32 ans = low - cdf.begin(); + KALDI_ASSERT(ans >=0 && ans < prob_size); + return ans; +} + bool WithProb(BaseFloat prob, struct RandomState* state) { KALDI_ASSERT(prob >= 0 && prob <= 1.1); // prob should be <= 1.0, // but we allow slightly larger values that could arise from roundoff in diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index ac590a06a25..9020b0300d1 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -141,6 +141,12 @@ struct RandomState { // Returns a random integer between min and max inclusive. int32 RandInt(int32 min, int32 max, struct RandomState* state = NULL); +// Returns a random integer number according to a discrete probability distribution. +// It works based on sampling from a discrete distribution and +// it returns i with prob(i). +// prob must sume to one. +int32 RandIntDiscreteDist(const std::vector &prob, struct RandomState* = NULL); + // Returns true with probability "prob", bool WithProb(BaseFloat prob, struct RandomState* state = NULL); // with 0 <= prob <= 1 [we check this]. diff --git a/src/matrix/compressed-matrix.cc b/src/matrix/compressed-matrix.cc index 2ac2c544bc8..f7953bc407a 100644 --- a/src/matrix/compressed-matrix.cc +++ b/src/matrix/compressed-matrix.cc @@ -36,7 +36,17 @@ MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) { 2 * header.num_rows * header.num_cols; } } - +// scale all element of matrix by scaling floats +// in GlobalHeader with alpha. +void CompressedMatrix::Scale(float alpha) { + if (data_ != NULL) { + GlobalHeader *h = reinterpret_cast(data_); + // scale the floating point values in each PerColHolder + // and leave all integers the same. + h->min_value *= alpha; + h->range *= alpha; + } +} template void CompressedMatrix::CopyFromMat( diff --git a/src/matrix/compressed-matrix.h b/src/matrix/compressed-matrix.h index 603134ab800..4853b31b5e0 100644 --- a/src/matrix/compressed-matrix.h +++ b/src/matrix/compressed-matrix.h @@ -114,6 +114,10 @@ class CompressedMatrix { void Clear(); + /// scales all elements of matrix by alpha. + /// It scales the floating point values in GlobalHeader by alpha. 
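The new Scale() above works by touching only the two floats in the compressed header: every stored byte decodes through an affine map of min_value and range, so scaling those two floats scales every decompressed value without re-encoding the data. A toy Python/NumPy sketch of that idea (a single 8-bit range for the whole matrix; the per-column percentile headers of the real format are deliberately omitted):

import numpy as np

def compress(mat):
    # Toy 8-bit compression: store min and range plus one byte per element.
    mn, mx = float(mat.min()), float(mat.max())
    rng = (mx - mn) if mx > mn else 1.0
    codes = np.round((mat - mn) / rng * 255.0).astype(np.uint8)
    return {"min_value": mn, "range": rng, "codes": codes}

def decompress(c):
    return c["min_value"] + c["range"] * (c["codes"].astype(np.float64) / 255.0)

def scale(c, alpha):
    # Mirror of CompressedMatrix::Scale(): only the header floats change,
    # the integer codes stay exactly the same.
    c["min_value"] *= alpha
    c["range"] *= alpha

m = np.random.randn(4, 5)
c = compress(m)
before = decompress(c)
scale(c, 0.1)
# Decompressing now yields 0.1 times the previous values, which is what the
# new matrix-lib-test check verifies against an uncompressed scaled matrix.
print(np.max(np.abs(decompress(c) - 0.1 * before)))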
+ void Scale(float alpha); + friend class Matrix; friend class Matrix; private: @@ -163,7 +167,7 @@ class CompressedMatrix { static inline float CharToFloat(float p0, float p25, float p75, float p100, unsigned char value); - + void *data_; // first GlobalHeader, then PerColHeader (repeated), then // the byte data for each column (repeated). Note: don't intersperse // the byte data with the PerColHeaders, because of alignment issues. diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc index 687ac66ac46..b2269590d80 100644 --- a/src/matrix/matrix-lib-test.cc +++ b/src/matrix/matrix-lib-test.cc @@ -4227,7 +4227,22 @@ template static void UnitTestCompressedMatrix() { } } } - + { // Check Scale() method for compressedMatrix. + for (int32 t = 0; t < 10; t++) { + float alpha = 0.1; + MatrixIndexT num_rows = 4 + Rand() % 20, + num_cols = 10 + Rand() % 50; + Matrix M(num_rows, num_cols); + M.SetRandn(); + CompressedMatrix cmat(M); + Matrix scaled_comp_mat(num_rows, num_cols), + scaled_mat(M); + scaled_mat.Scale(alpha); + cmat.Scale(alpha); + cmat.CopyToMat(&scaled_comp_mat); + scaled_comp_mat.ApproxEqual(scaled_mat, 1.0e-04); + } + } if (n < 5) { // test I/O. bool binary = (n % 2 == 1); { diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 477d36f190a..77741d4cd09 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -52,6 +52,12 @@ Real SparseVector::Sum() const { return sum; } +template +void SparseVector::Scale(Real alpha) { + for (int32 i = 0; i < pairs_.size(); ++i) + pairs_[i].second *= alpha; +} + template template void SparseVector::CopyElementsToVec(VectorBase *vec) const { @@ -606,6 +612,13 @@ void SparseMatrix::AppendSparseMatrixRows( inputs->clear(); } +template +void SparseMatrix::Scale(Real alpha) { + MatrixIndexT num_rows = rows_.size(); + for (MatrixIndexT row = 0; row < num_rows; row++) + rows_[row].Scale(alpha); +} + template Real TraceMatSmat(const MatrixBase &A, const SparseMatrix &B, @@ -746,6 +759,16 @@ void GeneralMatrix::CopyToMat(MatrixBase *mat, } } +void GeneralMatrix::Scale(BaseFloat alpha) { + if (mat_.NumRows() !=0) { + mat_.Scale(alpha); + } else if (cmat_.NumRows() != 0) { + cmat_.Scale(alpha); + } else if (smat_.NumRows() != 0) { + smat_.Scale(alpha); + } + +} const SparseMatrix& GeneralMatrix::GetSparseMatrix() const { if (mat_.NumRows() != 0 || cmat_.NumRows() != 0) KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type."; diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index 9f9362542e1..25ea83acb50 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -102,6 +102,9 @@ class SparseVector { void Write(std::ostream &os, bool binary) const; void Read(std::istream &os, bool binary); + + /// Scale all elements of sparse vector. + void Scale(Real alpha); private: MatrixIndexT dim_; @@ -195,6 +198,9 @@ class SparseMatrix { /// kUndefined behaves the same as kSetZero. void Resize(MatrixIndexT rows, MatrixIndexT cols, MatrixResizeType resize_type = kSetZero); + + /// Scale all elements in sparse matrix. + void Scale(Real alpha); // Use the Matrix::CopyFromSmat() function to copy from this to Matrix. Also // see Matrix::AddSmat(). There is not very extensive functionality for @@ -283,6 +289,9 @@ class GeneralMatrix { /// Implemented in ../cudamatrix/cu-sparse-matrix.cc void AddToMat(BaseFloat alpha, CuMatrixBase *cu_mat, MatrixTransposeType trans = kNoTrans) const; + + /// scale each element of matrix with a scalar value. 
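GeneralMatrix::Scale(), added above, simply dispatches to whichever of the three internal representations (full, compressed, or sparse) is populated. A small Python stand-in for that dispatch, using hypothetical container types rather than the Kaldi classes:

class GeneralMatrixSketch:
    # Holds at most one of three storages, loosely like GeneralMatrix.
    def __init__(self, dense=None, compressed=None, sparse=None):
        self.dense = dense            # list of rows of floats
        self.compressed = compressed  # e.g. {"min_value", "range", "codes"}
        self.sparse = sparse          # list of rows of (index, value) pairs

    def scale(self, alpha):
        # Scale whichever representation is non-empty; an empty matrix is a no-op.
        if self.dense is not None:
            self.dense = [[alpha * v for v in row] for row in self.dense]
        elif self.compressed is not None:
            self.compressed["min_value"] *= alpha
            self.compressed["range"] *= alpha
        elif self.sparse is not None:
            self.sparse = [[(i, alpha * v) for i, v in row] for row in self.sparse]

g = GeneralMatrixSketch(sparse=[[(0, 1.5), (3, -2.0)], [(2, 4.0)]])
g.scale(0.5)
print(g.sparse)   # [[(0, 0.75), (3, -1.0)], [(2, 2.0)]]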
+ void Scale(BaseFloat alpha); /// Assignment from regular matrix. GeneralMatrix &operator= (const MatrixBase &mat); diff --git a/src/nnet3/nnet-example.h b/src/nnet3/nnet-example.h index 1df7cd1e78e..eb5e03702f6 100644 --- a/src/nnet3/nnet-example.h +++ b/src/nnet3/nnet-example.h @@ -101,6 +101,7 @@ struct NnetExample { /// Caution: this operator == is not very efficient. It's only used in /// testing code. bool operator == (const NnetExample &other) const { return io == other.io; } + }; diff --git a/src/nnet3/nnet-nnet.h b/src/nnet3/nnet-nnet.h index 16e8333d5b1..83cc50c468a 100644 --- a/src/nnet3/nnet-nnet.h +++ b/src/nnet3/nnet-nnet.h @@ -186,7 +186,7 @@ class Nnet { /// returns index associated with this node name, or -1 if no such index. int32 GetNodeIndex(const std::string &node_name) const; - + /// returns index associated with this component name, or -1 if no such index. int32 GetComponentIndex(const std::string &node_name) const; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 955e200d072..c9495f076db 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -56,10 +56,9 @@ bool IsSimpleNnet(const Nnet &nnet) { // "input" and everything checks out. if (NumInputNodes(nnet) == 1) return true; - // Otherwise, there should be 2 inputs and one + // Otherwise, there should be input node with name input and one // should be called "ivector". - return NumInputNodes(nnet) == 2 && - nnet.GetNodeIndex("ivector") != -1 && + return nnet.GetNodeIndex("ivector") != -1 && nnet.IsInputNode(nnet.GetNodeIndex("ivector")); } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 9606bd5d5b7..41009189773 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -177,7 +177,6 @@ std::string NnetInfo(const Nnet &nnet); /// This function sets the dropout proportion in all dropout component to /// dropout_proportion value. void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); - /// This function finds a list of components that are never used, and outputs /// the integer comopnent indexes (you can use these to index /// nnet.GetComponentNames() to get their names). diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc index efb51f51910..746ec83dd1c 100644 --- a/src/nnet3bin/nnet3-copy-egs.cc +++ b/src/nnet3bin/nnet3-copy-egs.cc @@ -2,6 +2,7 @@ // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) // 2014 Vimal Manohar +// 2016 Pegah Ghahremani // See ../../COPYING for clarification regarding multiple authors // @@ -27,6 +28,34 @@ namespace kaldi { namespace nnet3 { +// rename io-name of eg w.r.t io_names list e.g. input/input-1,output/output-1 +// 'input' is renamed to input-1 and 'output' renamed to output-1. +void RenameIoNames(const std::string &io_names, + NnetExample *eg_modified) { + std::vector separated_io_names; + SplitStringToVector(io_names, ",", true, &separated_io_names); + int32 num_modified_io = separated_io_names.size(), + io_size = eg_modified->io.size(); + std::vector orig_io_list; + for (int32 io_ind = 0; io_ind < io_size; io_ind++) + orig_io_list.push_back(eg_modified->io[io_ind].name); + + for (int32 ind = 0; ind < num_modified_io; ind++) { + std::vector rename_io_name; + SplitStringToVector(separated_io_names[ind], "/", true, &rename_io_name); + // find the io in eg with specific name and rename it to new name. 
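The new RenameIoNames helper above takes a spec such as "input/input-1,output/output-1" and renames the matching NnetIo entries of an example, treating a missing old name as an error. A compact Python sketch of the same parsing and renaming, operating on a plain list of names instead of an NnetExample:

def rename_io_names(io_names, eg_io_names):
    # io_names: e.g. "output/output-swahili"; eg_io_names: names of the
    # NnetIo members of one example.  Returns the renamed list.
    renamed = list(eg_io_names)
    for pair in io_names.split(","):
        old_name, new_name = pair.split("/")
        if old_name not in renamed:
            raise ValueError("No io-node with name %s exists in eg." % old_name)
        renamed[renamed.index(old_name)] = new_name
    return renamed

print(rename_io_names("output/output-swahili", ["input", "ivector", "output"]))
# -> ['input', 'ivector', 'output-swahili']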
+ + int32 rename_io_ind = + std::find(orig_io_list.begin(), orig_io_list.end(), rename_io_name[0]) - + orig_io_list.begin(); + + if (rename_io_ind >= io_size) + KALDI_ERR << "No io-node with name " << rename_io_name[0] + << "exists in eg."; + eg_modified->io[rename_io_ind].name = rename_io_name[1]; + } +} + // returns an integer randomly drawn with expected value "expected_count" // (will be either floor(expected_count) or ceil(expected_count)). int32 GetCount(double expected_count) { @@ -278,7 +307,9 @@ int main(int argc, char *argv[]) { // you can set frame to a number to select a single frame with a particular // offset, or to 'random' to select a random single frame. - std::string frame_str; + std::string frame_str, + weight_str = "", + output_str = ""; ParseOptions po(usage); po.Register("random", &random, "If true, will write frames to output " @@ -301,6 +332,16 @@ int main(int argc, char *argv[]) { "feature left-context that we output."); po.Register("right-context", &right_context, "Can be used to truncate the " "feature right-context that we output."); + po.Register("weights", &weight_str, + "Rspecifier maps the output posterior to each example" + "If provided, the supervision weight for output is scaled." + " Scaling supervision weight is the same as scaling to the derivative during training " + " in case of linear objective." + "The default is one, which means we are not applying per-example weights."); + po.Register("outputs", &output_str, + "Rspecifier maps example old output-name to new output-name in example." + " If provided, the NnetIo with name 'output' in each example " + " is renamed to new output name."); po.Read(argc, argv); @@ -315,6 +356,8 @@ int main(int argc, char *argv[]) { std::string examples_rspecifier = po.GetArg(1); SequentialNnetExampleReader example_reader(examples_rspecifier); + RandomAccessTokenReader output_reader(output_str); + RandomAccessBaseFloatReader egs_weight_reader(weight_str); int32 num_outputs = po.NumArgs() - 1; std::vector example_writers(num_outputs); @@ -322,7 +365,7 @@ int main(int argc, char *argv[]) { example_writers[i] = new NnetExampleWriter(po.GetArg(i+2)); - int64 num_read = 0, num_written = 0; + int64 num_read = 0, num_written = 0, num_err = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { // count is normally 1; could be 0, or possibly >1. int32 count = GetCount(keep_proportion); @@ -332,12 +375,59 @@ int main(int argc, char *argv[]) { int32 index = (random ? Rand() : num_written) % num_outputs; if (frame_str == "" && left_context == -1 && right_context == -1 && frame_shift == 0) { - example_writers[index]->Write(key, eg); + NnetExample eg_modified = eg; + if (!weight_str.empty()) { + // scale the supervision weight for egs + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; + } + BaseFloat weight = egs_weight_reader.Value(key); + for (int32 i = 0; i < eg_modified.io.size(); i++) + if (eg_modified.io[i].name == "output") + eg_modified.io[i].features.Scale(weight); + } + if (!output_str.empty()) { + if (!output_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; + } + std::string new_output_name = output_reader.Value(key); + // rename output io name to $new_output_name. 
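The --weights branch above multiplies the supervision features of the io named "output" by a per-example weight read from an rspecifier; for a linear objective this is equivalent to scaling that example's derivative during training, which is how low-confidence semi-supervised data can be down-weighted. A small sketch of that per-example scaling, with plain Python containers standing in for the Kaldi reader and example types:

def apply_example_weights(egs, weights):
    # egs: dict key -> list of (io_name, rows of floats);
    # weights: dict key -> float.  Only the "output" io is scaled; examples
    # without a weight are skipped and counted, like the C++ loop.
    num_err = 0
    for key, io_list in egs.items():
        if key not in weights:
            num_err += 1
            continue
        w = weights[key]
        for name, rows in io_list:
            if name == "output":
                for row in rows:
                    for j in range(len(row)):
                        row[j] *= w
    return num_err

egs = {"utt1": [("input", [[1.0, 2.0]]), ("output", [[0.0, 1.0]])]}
print(apply_example_weights(egs, {"utt1": 0.7}), egs["utt1"][1])
# -> 0 ('output', [[0.0, 0.7]])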
+ std::string rename_io_names = "output/" + new_output_name; + RenameIoNames(rename_io_names, &eg_modified); + } + example_writers[index]->Write(key, eg_modified); num_written++; } else { // the --frame option or context options were set. NnetExample eg_modified; if (SelectFromExample(eg, frame_str, left_context, right_context, frame_shift, &eg_modified)) { + if (!weight_str.empty()) { + // scale the supervision weight for egs + if (!egs_weight_reader.HasKey(key)) { + KALDI_WARN << "No weight for example key " << key; + num_err++; + continue; + } + int32 weight = egs_weight_reader.Value(key); + for (int32 i = 0; i < eg_modified.io.size(); i++) + if (eg_modified.io[i].name == "output") + eg_modified.io[i].features.Scale(weight); + } + if (!output_str.empty()) { + if (!output_reader.HasKey(key)) { + KALDI_WARN << "No new output-name for example key " << key; + num_err++; + continue; + } + std::string new_output_name = output_reader.Value(key); + // rename output io name to $new_output_name. + std::string rename_io_names = "output/" + new_output_name; + RenameIoNames(rename_io_names, &eg_modified); + } // this branch of the if statement will almost always be taken (should only // not be taken for shorter-than-normal egs from the end of a file. example_writers[index]->Write(key, eg_modified); diff --git a/src/nnet3bin/nnet3-copy.cc b/src/nnet3bin/nnet3-copy.cc index c419e0e0f91..e4a41933fff 100644 --- a/src/nnet3bin/nnet3-copy.cc +++ b/src/nnet3bin/nnet3-copy.cc @@ -44,6 +44,7 @@ int main(int argc, char *argv[]) { BaseFloat learning_rate = -1, dropout = 0.0; std::string nnet_config, edits_config, edits_str; + BaseFloat scale = 1.0; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -64,6 +65,8 @@ int main(int argc, char *argv[]) { "'--edits=remove-orphans'."); po.Register("set-dropout-proportion", &dropout, "Set dropout proportion " "in all DropoutComponent to this value."); + po.Register("scale", &scale, "The parameter matrices are scaled" + " by the specified value."); po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -85,6 +88,9 @@ int main(int argc, char *argv[]) { if (learning_rate >= 0) SetLearningRate(learning_rate, &nnet); + if (scale != 1.0) + ScaleNnet(scale, &nnet); + if (dropout > 0) SetDropoutProportion(dropout, &nnet); diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 8627671f53a..f214f1d60ea 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -26,13 +26,15 @@ namespace kaldi { namespace nnet3 { -// returns the number of indexes/frames in the NnetIo named "output" in the eg, +// returns the number of indexes/frames in the NnetIo with output +// including string "output" as part of its name in the eg, // or crashes if it is not there. +// e.g. output-0, output-xent int32 NumOutputIndexes(const NnetExample &eg) { for (size_t i = 0; i < eg.io.size(); i++) - if (eg.io[i].name == "output") + if (eg.io[i].name.find("output") != std::string::npos) return eg.io[i].indexes.size(); - KALDI_ERR << "No output named 'output' in the eg."; + KALDI_ERR << "No output name with string 'output' as part of its name exists in the eg."; return 0; // Suppress compiler warning. }
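Finally, the nnet3-merge-egs change above relaxes the output lookup: instead of requiring an io named exactly "output", any io whose name contains the string "output" (e.g. "output-0", "output-xent", or a renamed per-language output produced by the new --outputs option of nnet3-copy-egs) supplies the frame count. A last Python sketch of that lookup:

def num_output_indexes(io_list):
    # io_list: list of (name, indexes) for one example.  Return the number of
    # indexes of the first io whose name contains "output", else raise.
    for name, indexes in io_list:
        if "output" in name:
            return len(indexes)
    raise RuntimeError("No output name with the string 'output' as part of "
                       "its name exists in the eg.")

eg = [("input", list(range(23))), ("output-swahili", list(range(8)))]
print(num_output_indexes(eg))   # -> 8

Together with the renaming and weighting options, this appears to be what lets examples with language-specific output names flow through the standard merging binary in the multilingual recipes.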