diff --git a/egs/ami/s5b/local/rnnlm/tuning/run_lstm_tdnn_bs_1a.sh b/egs/ami/s5b/local/rnnlm/tuning/run_lstm_tdnn_bs_1a.sh
new file mode 100755
index 00000000000..00a6edb8125
--- /dev/null
+++ b/egs/ami/s5b/local/rnnlm/tuning/run_lstm_tdnn_bs_1a.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (author: Daniel Povey)  Tony Robinson
+#           2017  Hainan Xu
+#           2017  Ke Li
+#           2017  Yiming Wang
+
+# This script is similar to rnnlm_lstm_tdnn_b.sh except that it adds backstitch training.
+
+# rnnlm/train_rnnlm.sh: best iteration (out of 18) was 17, linking it to final iteration.
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 45.6 / 68.7.
+# Train objf: -651.50 -4.44 -4.26 -4.15 -4.08 -4.03 -4.00 -3.97 -3.94 -3.92 -3.90 -3.89 -3.88 -3.86 -3.85 -3.84 -3.83 -3.82
+# Dev objf:   -10.76 -4.68 -4.47 -4.38 -4.33 -4.29 -4.28 -4.27 -4.26 -4.26 -4.25 -4.24 -4.24 -4.24 -4.23 -4.23 -4.23 -4.23
+
+# Begin configuration section.
+cmd=run.pl
+affix=1a
+embedding_dim=200
+embedding_l2=0.005  # embedding-layer l2 regularization
+comp_l2=0.005       # component-level l2 regularization
+output_l2=0.005     # output-layer l2 regularization
+epochs=90
+mic=sdm1
+stage=-10
+train_stage=0
+# backstitch options
+alpha=0.8        # backstitch training scale
+back_interval=1  # backstitch training interval
+
+. utils/parse_options.sh
+train=data/$mic/train/text
+dev=data/$mic/dev/text
+wordlist=data/lang/words.txt
+text_dir=data/rnnlm/text
+dir=exp/rnnlm_lstm_tdnn_bs_$affix
+mkdir -p $dir/config
+set -e
+
+for f in $train $dev $wordlist; do
+  [ ! -f $f ] && \
+    echo "$0: expected file $f to exist; search for run.sh and utils/prepare_lang.sh in run.sh" && exit 1
+done
+
+if [ $stage -le 0 ]; then
+  mkdir -p $text_dir
+  cat $train | cut -d ' ' -f2- > $text_dir/ami.txt
+  cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt
+fi
+
+if [ $stage -le 1 ]; then
+  cp $wordlist $dir/config/
+  n=`cat $dir/config/words.txt | wc -l`
+  echo "<brk> $n" >> $dir/config/words.txt
+
+  # Words that are not in words.txt but appear in the training or dev data
+  # will be mapped to <unk> during training.
+  echo "<unk>" >$dir/config/oov.txt
+
+  cat > $dir/config/data_weights.txt <<EOF
+ami   1   1.0
+EOF
+
+  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
+                             --unk-word="<unk>" \
+                             --data-weights-file=$dir/config/data_weights.txt \
+                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
+
+  # choose features
+  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
+                           --use-constant-feature=true \
+                           --top-word-features 10000 \
+                           --min-frequency 1.0e-03 \
+                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter]' \
+                           $dir/config/words.txt > $dir/config/features.txt
+
+  lstm_opts="l2-regularize=$comp_l2"
+  tdnn_opts="l2-regularize=$comp_l2"
+  output_opts="l2-regularize=$output_l2"
+
+  cat >$dir/config/xconfig <<EOF
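
A minimal invocation sketch (an assumption, not part of the diff; the option values simply restate the script's defaults, and the data paths are presumed to have been created by the main egs/ami/s5b/run.sh):

    # run from egs/ami/s5b after run.sh has produced data/sdm1/{train,dev}/text
    # and data/lang/words.txt
    local/rnnlm/tuning/run_lstm_tdnn_bs_1a.sh --mic sdm1 --stage 0 \
      --alpha 0.8 --back-interval 1

utils/parse_options.sh maps each "--option value" pair onto the shell variable of the same name (hyphens become underscores), so the backstitch scale (alpha) and interval (back_interval) can be overridden without editing the script. Roughly, backstitch training modifies each SGD update: every back_interval minibatches it first takes a step of scale -alpha times the learning rate and then a step of scale (1 + alpha) on the same minibatch, which acts as a regularizer during training.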