diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index cb5756188a4..cebb2b84f16 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1f.sh \ No newline at end of file +tuning/run_tdnn_1g.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh new file mode 100755 index 00000000000..e234b847aa7 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# 1g is as 1f but adding dropout (well, something like dropout-- the mask +# is shared across time and it's continuous rather than zero-one), increasing +# the hidden dimension, and training for more epochs. + +# local/chain/compare_wer.sh --online exp/chain/tdnn1f_sp exp/chain/tdnn1g_sp +# System tdnn1f_sp tdnn1g_sp +#WER dev_clean_2 (tgsmall) 14.21 13.76 +# [online:] 14.18 13.72 +#WER dev_clean_2 (tglarge) 10.32 9.65 +# [online:] 10.25 9.85 +# Final train prob -0.0507 -0.0453 +# Final valid prob -0.0912 -0.0892 +# Final train prob (xent) -1.3550 -1.1694 +# Final valid prob (xent) -1.6018 -1.4486 +# Num-params 4205322 6227338 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{f,g}_sp +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 1) xent:train/valid[10,16,final]=(-1.61,-1.41,-1.36/-1.82,-1.66,-1.60) logprob:train/valid[10,16,final]=(-0.067,-0.057,-0.051/-0.106,-0.097,-0.091) +# exp/chain/tdnn1g_sp: num-iters=25 nj=2..5 num-params=6.2M dim=40+100->2309 combine=-0.054->-0.053 (over 2) xent:train/valid[15,24,final]=(-1.49,-1.22,-1.17/-1.75,-1.51,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.050,-0.045/-0.106,-0.096,-0.089) + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1g # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=15 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l h). +# This run failed due to instability. + +# 7m25i is as 7m25g but with dropout-per-dim-continuous=true. +# +# 7m25g is as 7m25f but with dim=1536 for the subsampled layers (more like 7m25d than 7m25e). + +# 7m25f is as 7m25e but with a dropout schedule borrowed from the LSTM experiments. +# +# 7m25e is as 7m25d but reverting dims back from 1536 to 1280. + +# 7m25d is as 7m25c but reverting to sharing the linear layer before the +# prefinal layer (more like 7m23t{,2}). Also changing one splicing input +# to be from a layer that wasn't otherwise used as splicing input. + +# 7m25c is as 7m25b but for the layers after we start using 3's not 1's, +# increasing dim from 1280 to 1536. +# 7m25b is as 7m25a but with slightly different skip connections, +# so all layers are the sources of skip connections. (Also see 7m23u, although +# that experiment didn't give clear results). +# 7m25a is as 7m23t but with some renamings of layers to make it more +# understandable, and changing how the last layer is done (there's now a little +# bit less sharing). + +# 7m23t is as 7m23r but with 1280 instead of 1536 as the dim. +# Differernce vs. 23r is unclear (maybe slightly worse), but it +# seems slightly better than 23h, and it's nice that it has fewer parameters. 
+ + +# local/chain/compare_wer_general.sh --rt03 tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# System tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# WER on train_dev(tg) 12.28 11.95 12.18 +# WER on train_dev(fg) 11.21 10.97 11.12 +# WER on eval2000(tg) 15.0 15.0 14.9 +# WER on eval2000(fg) 13.5 13.6 13.5 +# WER on rt03(tg) 18.5 18.4 18.4 +# WER on rt03(fg) 16.1 15.9 16.2 +# Final train prob -0.083 -0.076 -0.077 +# Final valid prob -0.097 -0.091 -0.093 +# Final train prob (xent) -1.036 -0.978 -0.994 +# Final valid prob (xent) -1.0629 -1.0026 -1.0194 +# Num-parameters 23513380 23513380 20111396 + +# 7m23r is as 7m23h but with 6 epochs instead of 4. See also 7m23p, which +# had 3 epochs. + +# 7m23h is as 7m23b2 but with a small bugfix, removing a stray 'bottleneck-dim=192'. +# Seems slightly better. The comparison below includes our old TDNN+LSTM result +# with dropout, to show that we're doing better than that now. + +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# System tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# WER on train_dev(tg) 12.33 12.38 12.28 +# WER on train_dev(fg) 11.42 11.44 11.21 +# WER on eval2000(tg) 15.2 15.1 15.0 +# WER on eval2000(fg) 13.8 13.6 13.5 +# WER on rt03(tg) 18.6 18.4 18.5 +# WER on rt03(fg) 16.3 16.1 16.1 +# Final train prob -0.082 -0.084 -0.083 +# Final valid prob -0.099 -0.098 -0.097 +# Final train prob (xent) -0.959 -1.049 -1.036 +# Final valid prob (xent) -1.0305 -1.0661 -1.0629 +# Num-parameters 39558436 23120164 23513380 +# +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. 
+# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! + +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. +# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. 
+ + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m25l +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1536 input=Append(0, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts 
input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1536 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor bottleneck-dim=256 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 8 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh index be8d39de80b..e3d13ac1f65 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -4,7 +4,7 @@ # end, and no chain l2-regularize #[note: was 1e12e.] -# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1e12e_sp +# local/chain/compare_wer.sh exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp # System tdnn1e10_sp tdnn1e12e_sp #WER dev93 (tgpr) 7.29 7.20 #WER dev93 (tg) 7.08 6.81 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index a3dfa89cf0e..eda1461a2ab 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -689,6 +689,9 @@ def set_default_configs(self): # 'dropout' in the name 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout # mask is shared across time. + 'dropout-per-dim-continuous': False, # if you set this, it's + # like dropout-per-dim but with a + # continuous-valued (not zero-one) mask. 'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). 
@@ -864,32 +867,19 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim)) elif nonlinearity == 'dropout': - if not self.config['dropout-per-dim']: + if not (self.config['dropout-per-dim'] or + self.config['dropout-per-dim-continuous']): line = ('component name={0}.{1} type=DropoutComponent ' 'dim={2} dropout-proportion={3}'.format( self.name, nonlinearity, output_dim, self.config['dropout-proportion'])) else: - line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' - 'output-dim={1} dropout-proportion={2}'.format( - self.name, output_dim, self.config['dropout-proportion'])) - configs.append(line) - # note: the input to the dropout_mask component is never used, it's - # just syntactically required. - line = ('component-node name={0}.dropout_mask component={0}.dropout_mask ' - 'input={1}'.format(self.name, cur_node)) - configs.append(line) - line = ('component name={0}.dropout type=ElementwiseProductComponent ' - 'input-dim={1} output-dim={2} '.format( - self.name, 2 * output_dim, output_dim)) - configs.append(line) - line = ('component-node name={0}.dropout component={0}.dropout ' - 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' - ''.format(self.name, cur_node)) - configs.append(line) - cur_node = '{0}.dropout'.format(self.name) - continue + continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else '' + line = ('component name={0}.dropout type=GeneralDropoutComponent ' + 'dim={1} dropout-proportion={2} {3}'.format( + self.name, output_dim, self.config['dropout-proportion'], + continuous_opt)) else: raise RuntimeError("Unknown nonlinearity type: {0}" .format(nonlinearity)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 87edd661a6f..6b8b1834749 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -221,7 +221,8 @@ def process_args(args): if (not os.path.exists(args.dir+"/configs") and (args.input_model is None or not os.path.exists(args.input_model))): raise Exception("Either --trainer.input-model option should be supplied, " - "and exist; or the {0}/configs directory should exist.") + "and exist; or the {0}/configs directory should exist." + "".format(args.dir)) if args.transform_dir is None: args.transform_dir = args.lat_dir diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 620ea873eb7..c936061de26 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -33,10 +33,9 @@ DenominatorComputation::DenominatorComputation( den_graph_(den_graph), num_sequences_(num_sequences), frames_per_sequence_(nnet_output.NumRows() / num_sequences_), - exp_nnet_output_transposed_(nnet_output, kTrans), nnet_output_deriv_transposed_( - exp_nnet_output_transposed_.NumRows(), - std::min(exp_nnet_output_transposed_.NumCols(), + nnet_output.NumCols(), + std::min(nnet_output.NumRows(), static_cast(kMaxDerivTimeSteps) * num_sequences_)), alpha_(frames_per_sequence_ + 1, @@ -57,6 +56,14 @@ DenominatorComputation::DenominatorComputation( num_sequences_).SetZero(); KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); + // the kStrideEqualNumCols argument means we'll allocate a contiguous block of + // memory for this; it is added to ensure that the same block of memory + // (cached in the allocator) can be used for xent_output_deriv when allocated + // from chain-training.cc. 
+ exp_nnet_output_transposed_.Resize(nnet_output.NumCols(), + nnet_output.NumRows(), + kUndefined, kStrideEqualNumCols); + exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); exp_nnet_output_transposed_.ApplyExp(); } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index bf61bed67f0..f4b0d110373 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -52,8 +52,15 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, nnet_output_deriv); } - if (xent_output_deriv != NULL) - xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + if (xent_output_deriv != NULL) { + // the reason for kStrideEqualNumCols is so that we can share the memory + // block with the memory that was used for exp_nnet_output_transposed_ from + // chain-denominator.cc, which has just been freed; it also uses the + // kStrideEqualNumCols arg (its shape is the transpose of this matrix's + // shape). + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols(), + kSetZero, kStrideEqualNumCols); + } { diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 8ab03c7e14e..f2926ddc2f1 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -143,6 +143,12 @@ void cudaD_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, void cudaF_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); +void cudaD_mul_rows(dim3 Gr, dim3 Bl, double* dst, + const double* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride); +void cudaF_mul_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, + const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, + int src_stride); void cudaD_add_rows_direct(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* const * src, MatrixDim dst_dim); void cudaF_add_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* dst, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index ae7e25b716d..50dd3d1d0ca 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -1984,6 +1984,23 @@ static void _add_rows(Real alpha, Real* dst, const Real *src, } } +template +__global__ +static void _mul_rows(Real* dst, const Real *src, + const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, + int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + if (reorder[j] >= 0) { + int src_index = reorder[j] * src_stride + i; + dst[dst_index] *= src[src_index]; + } + } +} + + template __global__ static void _add_rows(Real alpha, Real* dst, const Real * const *src, @@ -3764,6 +3781,12 @@ void cudaF_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src, _add_rows<<>>(alpha, dst, src, reorder, dst_dim, src_stride); } +void cudaF_mul_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, + const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, + int src_stride) { + _mul_rows<<>>(dst, src, reorder, dst_dim, src_stride); +} + void cudaF_add_rows_direct(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* const * src, MatrixDim dst_dim) { _add_rows<<>>(alpha, dst, src, dst_dim); @@ -4454,6 +4477,12 @@ void cudaD_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, _add_rows<<>>(alpha, dst, src, reorder, dst_dim, src_stride); } +void cudaD_mul_rows(dim3 Gr, dim3 Bl, 
double* dst, + const double* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + _mul_rows<<>>(dst, src, reorder, dst_dim, src_stride); +} + void cudaD_add_rows_direct(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* const * src, MatrixDim dst_dim) { _add_rows<<>>(alpha, dst, src, dst_dim); diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 3518e0c71ed..fe706815a44 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -221,20 +221,30 @@ inline void cuda_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* const * src, MatrixDim dst_dim) { cudaD_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim); } +inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, + const float* const * src, MatrixDim dst_dim) { + cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim); +} inline void cuda_add_rows(dim3 Gr, dim3 Bl, double alpha, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { cudaD_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride); } -inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, - const float* const * src, MatrixDim dst_dim) { - cudaF_add_rows_direct(Gr, Bl, alpha, dst, src, dst_dim); -} inline void cuda_add_rows(dim3 Gr, dim3 Bl, float alpha, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { cudaF_add_rows(Gr, Bl, alpha, dst, src, reorder, dst_dim, src_stride); } +inline void cuda_mul_rows(dim3 Gr, dim3 Bl, double* dst, + const double* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + cudaD_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride); +} +inline void cuda_mul_rows(dim3 Gr, dim3 Bl, float* dst, + const float* src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + cudaF_mul_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride); +} inline void cuda_add_smat(dim3 Gr, dim3 Bl, double* mat, MatrixDim mat_dim, double alpha, const int* smat_row_ptr, const int* smat_col_idx, const double* smat_val) { diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 909e5552a35..33db8b3e625 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -534,6 +534,42 @@ static void UnitTestCuMatrixAddRows() { } +template +static void UnitTestCuMatrixMulRows() { + for (int32 p = 0; p < 2; p++) { + MatrixIndexT num_rows1 = 10 + Rand() % 10, + num_rows2 = 10 + Rand() % 10, + num_cols = 10 + Rand() % 10; + CuMatrix M(num_rows1, num_cols); + M.SetRandn(); + + CuMatrix N1(num_rows2, num_cols), + O(num_rows2, num_cols); + std::vector reorder(num_rows2); + std::vector reorder_src(num_rows2, NULL); + for (int32 i = 0; i < num_rows2; i++) { + reorder[i] = -1 + (Rand() % (num_rows1 + 1)); + if (reorder[i] != -1) + reorder_src[i] = M.RowData(reorder[i]); + } + + CuArray reorder_cuda(reorder); + N1.MulRows(M, reorder_cuda); + + for (int32 i = 0; i < num_rows2; i++) { + if (reorder[i] != -1) { + CuSubVector O_row(O, i), + M_row(M, reorder[i]); + O_row.MulElements(M_row); + } + } + + AssertEqual(N1, O); + } +} + + + template static void UnitTestCuMatrixAddToRows() { for (int32 p = 0; p < 2; p++) { @@ -2914,6 +2950,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixCopyColsFromVec(); UnitTestCuMatrixCopyToRows(); UnitTestCuMatrixAddRows(); + UnitTestCuMatrixMulRows(); UnitTestCuMatrixAddToRows(); UnitTestCuMatrixAddRowRanges(); 
UnitTestCuMatrixAddTpMat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 813c5e75d14..34290561cc5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2722,6 +2722,41 @@ void CuMatrixBase::AddRows(Real alpha, } } +template +void CuMatrixBase::MulRows(const CuMatrixBase &src, + const CuArrayBase &indexes) { + if (NumRows() == 0) return; + KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + KALDI_ASSERT(src.NumCols() == NumCols()); + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_mul_rows(dimGrid, dimBlock, + data_, src.Data(), indexes.Data(), Dim(), src.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + MatrixBase &this_mat(Mat()); + const MatrixBase &src_mat(src.Mat()); + int32 num_rows = NumRows(); + const MatrixIndexT *index_ptr = indexes.Data(); + for (int32 r = 0; r < num_rows; r++) { + int32 src_r = index_ptr[r]; + if (src_r < 0) + continue; + SubVector this_row(this_mat, r), + src_row(src_mat, src_r); + this_row.MulElements(src_row); + } + } +} + + template void CuMatrixBase::AddRows(Real alpha, const CuArrayBase &src) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 7c3a2a2e11f..86c50cfc485 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -139,6 +139,15 @@ class CuMatrixBase { const CuMatrixBase &src, const CuArrayBase &indexes); + + /// Does for each row r, this.Row(r) *= alpha * src.row(indexes[r]), + /// where '*=' is elementwise multiplication. + /// If indexes[r] < 0, does not add anything. + /// src.NumCols() must equal this.NumCols() + void MulRows(const CuMatrixBase &src, + const CuArrayBase &indexes); + + /// Does for each row r, this.Row(r) += alpha * src[r], /// treating src[r] as the beginning of a region of memory representing /// a vector of floats, of the same length as this.NumCols(). diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 2080c60077b..844fb82d32a 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -92,18 +92,101 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { num_minibatches_processed_++; } +// This object exists to help avoid memory fragmentation: it allocates, +// but does not use, the exact sizes of memory that are going to be needed +// in ComputeChainObjfAndDeriv(). +class ChainTrainerMemoryHolder { + public: + ChainTrainerMemoryHolder(const Nnet &nnet, + int32 num_den_graph_states, + const NnetChainExample &eg); + private: + CuMatrix nnet_output_deriv_; + CuMatrix xent_output_deriv_; + CuMatrix beta_; + CuMatrix alpha_; + +}; + +ChainTrainerMemoryHolder::ChainTrainerMemoryHolder(const Nnet &nnet, + int32 den_graph_states, + const NnetChainExample &eg) { + + std::vector::const_iterator iter = eg.outputs.begin(), + end = eg.outputs.end(); + + int32 max_rows = 0, + max_cols = 0; + + size_t max_frames_per_sequence = 0, + max_sequence_size = 0, + max_alpha_matrix_size = 0; + + for (; iter != end; ++iter) { + // there will normally be just one of these things; we'll normally loop once. 
+ const NnetChainSupervision &sup = *iter; + + int32 output_rows = sup.supervision.num_sequences * sup.supervision.frames_per_sequence; + int32 output_cols = nnet.OutputDim("output"); + + size_t curr_frames_per_sequence = output_rows / sup.supervision.num_sequences + 1; + size_t den_graph_size = den_graph_states + 1; + size_t curr_sequence_size = den_graph_size * sup.supervision.num_sequences; + size_t curr_alpha_matrix_size = curr_frames_per_sequence * curr_sequence_size; + + if (curr_alpha_matrix_size > max_alpha_matrix_size) { + max_alpha_matrix_size = curr_alpha_matrix_size; + max_frames_per_sequence = curr_frames_per_sequence; + max_sequence_size = curr_sequence_size; + } + + size_t matrix_size = output_rows * output_cols; + if (matrix_size > (max_rows * max_cols)) { + max_rows = output_rows; + max_cols = output_cols; + } + } + + // the sequence of resizes is in a specific order (bigger to smaller) + // so that the cudaMalloc won't trash the memory it has already + // alloc'd in the previous iterations + alpha_.Resize(max_frames_per_sequence, + max_sequence_size, + kUndefined); + + + nnet_output_deriv_.Resize(max_rows, max_cols, kUndefined); + // note: the same block of memory can be used for xent_output_deriv_ as is + // used for exp_nnet_output_transposed_ in chain-training.cc. + xent_output_deriv_.Resize(max_rows, max_cols, + kUndefined, kStrideEqualNumCols); + + beta_.Resize(2, max_sequence_size, kUndefined); +} + void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); + + // reserve the memory needed in ProcessOutputs (before memory gets fragmented + // by the call to computer.Run(). + ChainTrainerMemoryHolder *memory_holder = + new ChainTrainerMemoryHolder(*nnet_, den_graph_.NumStates(), eg); + // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); + // 'this->ProcessOutputs()' is going to need the same sizes as are stored in + // 'memory_holder'. + delete memory_holder; + + // Probably could be merged in a single call PreallocateChainTrainerMemory(*nnet_, eg) ? this->ProcessOutputs(false, eg, &computer); computer.Run(); @@ -140,7 +223,7 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, const NnetTrainerOptions &nnet_config = opts_.nnet_config; // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(nnet_config.compute_config, computation, nnet_, delta_nnet_); // give the inputs to the computer object. 
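
[Editor's note] The ChainTrainerMemoryHolder added above relies on the caching behaviour of the GPU allocator: by allocating, and then immediately releasing, the largest matrices that ProcessOutputs() will need before computer.Run() fragments device memory, the later allocations of those exact sizes can be served from the allocator's cache. The standalone sketch below only illustrates that "reserve, then release" idea with a toy size-keyed cache; CachingAllocator, Malloc and Free are made-up names for illustration and are not Kaldi's CuMemoryAllocator API.

    // Toy illustration (not Kaldi code) of why pre-allocating and freeing a
    // large block before the forward pass helps a caching allocator.
    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <vector>

    class CachingAllocator {              // stand-in for a caching GPU allocator
     public:
      ~CachingAllocator() {
        for (auto &kv : cache_)
          for (void *p : kv.second) ::operator delete(p);
      }
      void *Malloc(size_t bytes) {
        auto it = cache_.find(bytes);
        if (it != cache_.end() && !it->second.empty()) {
          void *p = it->second.back();    // reuse a cached block of this exact size
          it->second.pop_back();
          return p;
        }
        return ::operator new(bytes);     // otherwise take a fresh allocation
      }
      void Free(void *p, size_t bytes) { cache_[bytes].push_back(p); }
     private:
      std::map<size_t, std::vector<void*> > cache_;
    };

    int main() {
      CachingAllocator alloc;
      const size_t big = 64 * 1024 * 1024;
      void *held = alloc.Malloc(big);     // 1) "memory holder": grab the big block early
      alloc.Free(held, big);              // 2) release it; it stays in the cache
      // 3) ... the forward pass would allocate many smaller buffers here ...
      void *reused = alloc.Malloc(big);   // 4) the objf/deriv code gets the cached block back
      std::cout << (reused == held ? "reused cached block\n" : "fresh block\n");
      alloc.Free(reused, big);
      return 0;
    }

The same reasoning explains the comment in the constructor above about resizing from bigger to smaller: requesting the largest buffers first keeps the already-reserved blocks from being re-split before the matching requests arrive.
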
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index ce4bbd0940a..c73f3fb921d 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -66,6 +66,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new TimeHeightConvolutionComponent::PrecomputedIndexes(); } else if (cpi_type == "RestrictedAttentionComponentPrecomputedIndexes") { ans = new RestrictedAttentionComponent::PrecomputedIndexes(); + } else if (cpi_type == "GeneralDropoutComponentPrecomputedIndexes") { + ans = new GeneralDropoutComponentPrecomputedIndexes(); } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); @@ -158,6 +160,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new DropoutComponent(); } else if (component_type == "DropoutMaskComponent") { ans = new DropoutMaskComponent(); + } else if (component_type == "GeneralDropoutComponent") { + ans = new GeneralDropoutComponent(); } else if (component_type == "BackpropTruncationComponent") { ans = new BackpropTruncationComponent(); } else if (component_type == "LstmNonlinearityComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c34d550d681..79a1f1a5602 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -588,7 +588,7 @@ class UpdatableComponent: public Component { self-repair mechanism is activated. -1000 is a special value which will cause a component-specific default to be used. - block-dim Defaults to dim, but may be any nonzero divisor of dim. It affects the + block-dim Defaults to dim, but may be any divisor of dim. It affects the self-repair, which will be done while treating the input/output as repeating blocks of size 'block-dim' (e.g. blocks of filters). It allows us to do self-repair on the filter level in CNNs. 
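
[Editor's note] For reference, the continuous masks generated by the GeneralDropoutComponent and DropoutMaskComponent code below are produced by drawing uniform values in [0,1], scaling by 4p and adding (1 - 2p), i.e. the mask is uniform on [1 - 2p, 1 + 2p] with expected value exactly 1 (for the default p = 0.5 it ranges over [0, 2]). The standalone sketch below (not Kaldi code; the variable names are illustrative) just checks that transform numerically.

    // Numeric check of the continuous dropout-mask transform used below:
    // mask = u * 4p + (1 - 2p), with u ~ Uniform[0,1], so E[mask] = 1.
    #include <iostream>
    #include <random>

    int main() {
      const double p = 0.5;                       // dropout-proportion
      std::mt19937 rng(0);
      std::uniform_real_distribution<double> u(0.0, 1.0);
      const int n = 1000000;
      double sum = 0.0;
      for (int i = 0; i < n; i++) {
        double mask = u(rng) * 4.0 * p + (1.0 - 2.0 * p);  // same scale/offset as Propagate()
        sum += mask;
      }
      std::cout << "empirical mean of mask = " << sum / n   // should be close to 1.0
                << " (range [" << 1.0 - 2.0 * p << ", " << 1.0 + 2.0 * p << "])\n";
      return 0;
    }

Because the mask has expectation 1, test mode can simply substitute a constant mask of 1.0, which is what the Propagate() code below does when test_mode_ is set with continuous=true.
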
diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index d7595378c1f..39bd156e360 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -41,7 +41,7 @@ bool CheckStringsApproxEqual(const std::string &a, int32 tolerance = 3) { if (!StringsApproxEqual(a, b, tolerance)) { KALDI_WARN << "Strings differ: " << a - << "\vs.\n" << b; + << "\nvs.\n" << b; return false; } else { return true; diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 98aed592a62..bb0e7c917fc 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -636,8 +636,8 @@ static void PrintCommand(std::ostream &os_out, KALDI_ASSERT(c.arg2 == kCompressedMatrixInt16); compressed_matrix_type = "uint16"; } - os << "CompressMatrix(" << submatrix_strings[c.arg1] - << ", " << range << ", " << compressed_matrix_type << ", " + os << "CompressMatrix(" << submatrix_strings[c.arg1] << ", " + << range << ", " << compressed_matrix_type << ", " << truncate << ")\n"; break; } diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 19eecdda72b..cae6f41f5f2 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -399,8 +399,8 @@ void NnetComputer::ExecuteCommand() { compressed_matrices_[m]->CopyFromMat(matrices_[m]); matrices_[m].Resize(0, 0); } - break; #endif + break; case kDecompressMatrix: #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index dd6e950a7d1..2720fbbd0bd 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const { stream << Type() << ", output-dim=" << output_dim_ << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5) { } + output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_) { } + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } void* DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate( out->Set(1.0); return NULL; } + + if (continuous_) { + if (test_mode_) { + out->Set(1.0); + } else { + const_cast&>(random_generator_).RandUniform(out); + out->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. + out->Add(1.0 - (2.0 * dropout_proportion)); + } + return NULL; + } + if (test_mode_) { out->Set(1.0 - dropout_proportion); return NULL; } + const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - // To generate data where it's never the case that both of the dimensions - // for a row are zero, we generate uniformly distributed data (call this u_i), - // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) - // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) - int32 num_rows = out->NumRows(); - // later we may make this a bit more efficient. 
- CuVector temp(num_rows, kUndefined); - const_cast&>(random_generator_).RandUniform(&temp); - temp.Add(-dropout_proportion); - out->CopyColFromVec(temp, 0); - temp.Add(-1.0 + (2.0 * dropout_proportion)); - // Now, 'temp' contains the original uniformly-distributed data plus - // -(1 - dropout_proportion). - temp.Scale(-1.0); - out->CopyColFromVec(temp, 1); - out->ApplyHeaviside(); + + if (out->NumCols() == 2 || out->NumCols() == 3) { + // This is a kind of special case relevant to LSTms. + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. + CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). + temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } return NULL; } @@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - std::string token; - ReadToken(is, binary, &token); - if (token == "") { + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &test_mode_); // read test mode - ExpectToken(is, binary, ""); } else { test_mode_ = false; - KALDI_ASSERT(token == ""); } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); } @@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dropout_proportion_); WriteToken(os, binary, ""); WriteBasicType(os, binary, test_mode_); + if (continuous_) + WriteToken(os, binary, ""); WriteToken(os, binary, ""); } @@ -1480,11 +1507,280 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); test_mode_ = false; cfl->GetValue("test-mode", &test_mode_); } +std::string GeneralDropoutComponent::Info() const { + std::ostringstream stream; + stream << Type() + << ", dim=" << dim_ + << ", block-dim=" << block_dim_ + << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; + if (time_period_ > 0) + stream << ", time-period=" << time_period_; + return stream.str(); +} + +GeneralDropoutComponent::GeneralDropoutComponent(): + dim_(-1), block_dim_(-1), time_period_(0), + dropout_proportion_(0.5), continuous_(false) { } + +GeneralDropoutComponent::GeneralDropoutComponent( + const GeneralDropoutComponent &other): + dim_(other.dim_), + block_dim_(other.block_dim_), + time_period_(other.time_period_), + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } + +void* GeneralDropoutComponent::Propagate( + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &in, + CuMatrixBase *out) const { + + KALDI_ASSERT(SameDim(in, *out)); + + // The 
following will do nothing if 'out' and 'in' refer to the same data. + out->CopyFromMat(in); + + if (test_mode_ || dropout_proportion_ == 0.0) + return NULL; + + const GeneralDropoutComponentPrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL); + + CuMatrix *mask = GetMemo(indexes->num_mask_rows); + + if (block_dim_ < dim_) { + KALDI_ASSERT(out->Stride() == out->NumCols()); + int32 num_rows = out->NumRows(), + dim_multiple = dim_ / block_dim_, + num_rows_reshaped = num_rows * dim_multiple; + CuSubMatrix out_reshaped(out->Data(), block_dim_, + num_rows_reshaped, + num_rows_reshaped); + out_reshaped.MulRows(*mask, indexes->indexes); + } else { + out->MulRows(*mask, indexes->indexes); + } + return mask; +} + +void GeneralDropoutComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + KALDI_ASSERT(in_deriv != NULL && SameDim(*in_deriv, out_deriv)); + + // The following will do no work if in_deriv->Data() == out_deriv.Data(). + in_deriv->CopyFromMat(out_deriv); + + if (test_mode_ || dropout_proportion_ == 0.0) { + KALDI_ASSERT(memo == NULL); + return; + } + + const GeneralDropoutComponentPrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL && memo != NULL); + CuMatrix *mask = reinterpret_cast*>(memo); + + if (block_dim_ < dim_) { + KALDI_ASSERT(in_deriv->Stride() == in_deriv->NumCols()); + int32 num_rows = in_deriv->NumRows(), + dim_multiple = dim_ / block_dim_, + num_rows_reshaped = num_rows * dim_multiple; + CuSubMatrix in_deriv_reshaped(in_deriv->Data(), block_dim_, + num_rows_reshaped, + num_rows_reshaped); + in_deriv_reshaped.MulRows(*mask, indexes->indexes); + } else { + in_deriv->MulRows(*mask, indexes->indexes); + } +} + +void GeneralDropoutComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &time_period_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dropout_proportion_); + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); + test_mode_ = true; + } else { + test_mode_ = false; + } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); +} + + +void GeneralDropoutComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, time_period_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dropout_proportion_); + if (test_mode_) + WriteToken(os, binary, ""); + if (continuous_) + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); +} + +Component* GeneralDropoutComponent::Copy() const { + return new GeneralDropoutComponent(*this); +} + +void GeneralDropoutComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = 0; + bool ok = cfl->GetValue("dim", &dim_); + KALDI_ASSERT(ok && dim_ > 0); + block_dim_ = dim_; + cfl->GetValue("block-dim", &block_dim_); + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0)) + KALDI_ERR << "Invalid 
configuration dim=" << dim_ + << ", block-dim=" << block_dim_; + time_period_ = 0; + cfl->GetValue("time-period", &time_period_); + dropout_proportion_ = 0.5; + cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); + test_mode_ = false; + cfl->GetValue("test-mode", &test_mode_); +} + + +CuMatrix* GeneralDropoutComponent::GetMemo( + int32 num_mask_rows) const { + KALDI_ASSERT(num_mask_rows > 0 && !test_mode_ && + dropout_proportion_ > 0.0); + CuMatrix *ans = new CuMatrix(num_mask_rows, block_dim_); + BaseFloat dropout_proportion = dropout_proportion_; + + // This const_cast is only safe assuming you don't attempt + // to use multi-threaded code with the GPU. + const_cast&>(random_generator_).RandUniform(ans); + + if (!continuous_) { + ans->Add(-dropout_proportion); + // now, a proportion "dropout_proportion" will be < 0.0. After applying the + // function (x>0?1:0), a proportion "dropout_proportion" will be zero and (1 - + // dropout_proportion) will be 1.0. + ans->ApplyHeaviside(); + ans->Scale(1.0 / dropout_proportion); + } else { + ans->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. + ans->Add(1.0 - (2.0 * dropout_proportion)); + } + return ans; +} + +ComponentPrecomputedIndexes* GeneralDropoutComponent::PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const { + KALDI_ASSERT(input_indexes == output_indexes); + + GeneralDropoutComponentPrecomputedIndexes *ans = new + GeneralDropoutComponentPrecomputedIndexes; + int32 size = input_indexes.size(), time_period = time_period_, + cur_row = 0; + std::vector indexes(size); + // the map 'm' will map from a pair from (n, t) value to the row-index of the + // dropout-mask matrix*. However, the 't' isn't a real 't' value; + // if time_period_ == 0, the 't' value will just be zero; otherwise, + // it will be t divided by time_period_ (rounding towards negative infinity). + + // *before considering effects related to when block_dim_ != dim_. + + std::unordered_map, int32, PairHasher > m; + for (int32 i = 0; i < size; i++) { + int32 n = input_indexes[i].n, + t = (time_period == 0 ? 
0 : DivideRoundingDown(input_indexes[i].t, + time_period)); + std::pair p(n, t); + + std::unordered_map, int32, + PairHasher >::const_iterator + iter = m.find(p); + if (iter != m.end()) { + indexes[i] = iter->second; + } else { + m[p] = cur_row; + indexes[i] = cur_row; + cur_row++; + } + } + int32 multiple = dim_ / block_dim_; + ans->num_mask_rows = cur_row; + if (multiple == 1) { + ans->indexes.CopyFromVec(indexes); + } else { + ans->num_mask_rows = cur_row * multiple; + std::vector repeated_indexes; + repeated_indexes.reserve(size * multiple); + for (int32 i = 0; i < size; i++) { + int32 row = indexes[i]; + for (int32 j = 0; j < multiple; j++) + repeated_indexes.push_back(row); + } + ans->indexes.CopyFromVec(repeated_indexes); + } + return ans; +} + +void GeneralDropoutComponentPrecomputedIndexes::Write(std::ostream &os, + bool binary) const { + WriteToken(os, binary, + ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_mask_rows); + WriteToken(os, binary, ""); + indexes.Write(os, binary); + WriteToken(os, binary, + ""); +} + +void GeneralDropoutComponentPrecomputedIndexes::Read(std::istream &is, + bool binary) { + ExpectOneOrTwoTokens(is, binary, + "", + ""); + ReadBasicType(is, binary, &num_mask_rows); + ExpectToken(is, binary, ""); + indexes.Read(is, binary); + ExpectToken(is, binary, + ""); +} } // namespace nnet3 diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 36829329d66..cff73a55b59 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 + // dropout-proportion=0.5 output-dim=-1 continuous=false + // With the 'continous=false' option (the default), it generates + // 0 with probability 'dropout-proportion' and 1 otherwise. + // With 'continuous=true' it outputs 1 plus dropout-proportion times + // a value uniformly distributed on [-2, 2]. (e.g. if dropout-proportion is + // 0.5, this would amount to a value uniformly distributed on [0,2].) virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -771,12 +776,182 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; + bool continuous_; + const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. }; +/** + GeneralDropoutComponent implements dropout, including a continuous + variant where the thing we multiply is not just zero or one, but may + be a continuous value. It is intended for the case where you want to + either share the dropout mask across all of time, or across groups + of 't' values (e.g. the first block of 10 values gets one dropout + mask, the second block of 10 gets another one, and so on). + + + Configuration values accepted on the command line, with defaults: + + dim Dimension of the input and output of this component, + e.g. 512 + + block-dim Block size if you want the dropout mask to repeat, + e.g. if dim=512 and you sent block-dim=128, there will + be a mask of dimension 128 repeated 4 times. This can + be useful in convolutional setups. If not specified, + block-dim defaults to 'dim'; if specified, it must be + a divisor of 'dim'. + + dropout-proportion=0.5 For conventional dropout, this is the proportion + of mask values that (in expectation) are zero; it would + normally be between 0 and 0.5. 
The nonzero mask values + will be given values 1.0 / dropout_proportion, so that the + expected value is 1.0. This behavior is different from + DropoutComponent and DropoutMaskComponent. + + For continuous dropout (continuous==true), the dropout scales + will have values (1.0 + 2 * dropout-proportion * + Uniform[-1,1]). This might seem like a strange choice, but it + means that dropout-proportion=0.5 gives us a kind of + 'extremal' case where the dropout scales are distributed as + Uniform[0, 2] and we can pass in the dropout scale as if it + were a conventional dropout scale. + + time-period=0 This determines how the dropout mask interacts + with the time index (t). In all cases, different sequences + (different 'n' values) get different dropout masks. + If time-period==0, then the dropout mask is shared across + all time values. If you set time-period > 0, then the + dropout mask is shared across blocks of time values: for + instance if time-period==10, then we'll use one dropout + mask for t values 0 through 9, another for 10 through 19, + and so on. In all cases, the dropout mask will be shared + across all 'x' values, although in most setups the x values + are just zero so this isn't very interesting. + If you set time-period==1 it would be similar to regular + dropout, and it would probably make more sense to just use the + normal DropoutComponent. + + */ +class GeneralDropoutComponent: public RandomComponent { + public: + virtual int32 InputDim() const { return dim_; } + + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + + virtual void InitFromConfig(ConfigLine *cfl); + + GeneralDropoutComponent(); + + GeneralDropoutComponent(const GeneralDropoutComponent &other); + + virtual std::string Type() const { return "GeneralDropoutComponent"; } + virtual int32 Properties() const { + return kRandomComponent|kPropagateInPlace|kBackpropInPlace|kUsesMemo| + (block_dim_ != dim_ ? (kInputContiguous|kOutputContiguous) : 0); + } + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void DeleteMemo(void *memo) const { + delete static_cast*>(memo); + } + + virtual ComponentPrecomputedIndexes* PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + + virtual Component* Copy() const; + + void SetDropoutProportion(BaseFloat p) { dropout_proportion_ = p; } + + private: + + // Returns a random matrix of dimension 'num_mask_rows' by 'block_dim_'. This + // should not be called if test_mode_ is true or dropout_proportion_ is zero. + CuMatrix *GetMemo(int32 num_mask_rows) const; + + + // The input and output dimension + int32 dim_; + + // block_dim_ must divide dim_. + int32 block_dim_; + + // time_period_ can be zero if we want all 't' values to share the same + // dropout mask, and a value more than zero if we want blocks of 't' values to + // share the dropout mask. For example, if time_period_ is 10, blocks of size + // 10 frames will share the same dropout mask. 
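[Editor's aside] To make the time-period sharing rule concrete: each (n, t) index is keyed on (n, t divided by time-period, rounding towards negative infinity), and all indexes with the same key reuse one row of the dropout mask. The standalone sketch below (plain C++; the names FloorDiv, row_of_group etc. are illustrative, and this is not the PrecomputeIndexes() code from this patch) shows that two sequences of 25 frames with time-period=10 need only six mask rows.

// Sketch only: how (n, t) indexes collapse onto shared dropout-mask rows
// when time-period > 0.  FloorDiv is a stand-in for DivideRoundingDown.
#include <iostream>
#include <map>
#include <utility>
#include <vector>

// Integer division rounding towards negative infinity, so t = -1 with
// time_period = 10 falls in block -1, not block 0.
static int FloorDiv(int a, int b) {
  int q = a / b, r = a % b;
  return (r != 0 && ((r < 0) != (b < 0))) ? q - 1 : q;
}

int main() {
  const int time_period = 10;
  // Two sequences (n = 0, 1), frames t = 0..24 each.
  std::vector<std::pair<int, int> > nt;
  for (int n = 0; n < 2; n++)
    for (int t = 0; t < 25; t++)
      nt.push_back(std::make_pair(n, t));

  std::map<std::pair<int, int>, int> row_of_group;  // (n, t-block) -> mask row
  std::vector<int> mask_row(nt.size());
  int next_row = 0;
  for (size_t i = 0; i < nt.size(); i++) {
    int block = (time_period == 0 ? 0 : FloorDiv(nt[i].second, time_period));
    std::pair<int, int> key(nt[i].first, block);
    std::map<std::pair<int, int>, int>::iterator it = row_of_group.find(key);
    if (it == row_of_group.end()) {
      row_of_group[key] = next_row;
      mask_row[i] = next_row++;
    } else {
      mask_row[i] = it->second;
    }
  }
  // 2 sequences * 3 blocks (t = 0..9, 10..19, 20..24) -> 6 mask rows.
  std::cout << "num mask rows: " << next_row << "\n";
  return 0;
}

[End of aside.]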
+ int32 time_period_; + + BaseFloat dropout_proportion_; + + bool continuous_; + + bool test_mode_; + + const GeneralDropoutComponent &operator + = (const GeneralDropoutComponent &other); // Disallow. +}; + +// This stores some precomputed indexes for GeneralDropoutComponent. +// This object is created for every instance of the Propagate() +// function in the compiled computation. +class GeneralDropoutComponentPrecomputedIndexes: + public ComponentPrecomputedIndexes { + public: + + + // num_mask_rows is the number of rows in the dropout-mask matrix; + // it's num-cols is the block_dim_ of the component. + int32 num_mask_rows; + + // 'indexes' is of dimension (the number of rows in the matrix we're doing + // Propagate() or Backprop() on) times the (dim_ / block_dim_) of the + // GeneralDropoutComponent. Each value is in the range [0, num_mask_rows-1], + // and each value is repeated (dim_ / block_dim_) times. This array is used + // to multiply the reshaped values or derivatives by the appropriate rows of + // the dropout matrix. + CuArray indexes; + + virtual ~GeneralDropoutComponentPrecomputedIndexes() { } + + ComponentPrecomputedIndexes *Copy() const { + return new GeneralDropoutComponentPrecomputedIndexes(*this); + } + + virtual void Write(std::ostream &os, bool binary) const; + + virtual void Read(std::istream &is, bool binary); + + virtual std::string Type() const { + return "GeneralDropoutComponentPrecomputedIndexes"; + } +}; + + + + } // namespace nnet3 diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 1806fe38493..37ad624d0f0 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -58,7 +58,7 @@ namespace nnet3 { Configuration values accepted: dim, or input-dim Input dimension of this component, e.g. 1024. Will be the same as the output dimension if add-log-stddev=false. - block-dim Defaults to 'dim' you may specify a nonzero divisor + block-dim Defaults to 'dim' you may specify a divisor of 'dim'. In this case the input dimension will be interpreted as blocks of dimension 'block-dim' to which the nonlinearity described above is applied @@ -144,11 +144,11 @@ class NormalizeComponent: public Component { Accepted configuration values: dim Dimension of the input and output - block-dim Defaults to 'dim', but may be set to a nonzero divisor + block-dim Defaults to 'dim', but may be set to a divisor of 'dim'. In this case, each block of dimension 'block-dim' is treated like a separate row of the input matrix, which means that the stats from n'th element of each - block are pooled into one class, for each n.a + block are pooled into one class, for each n. epsilon Small term added to the variance that is used to prevent division by zero target-rms This defaults to 1.0, but if set, for instance, to 2.0, diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index b1eb30a55bf..2d776180533 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; - // If true, we expect an extra 2 dimensions on the input, for dropout masks + // If true, we expect an extra 3 dimensions on the input, for dropout masks // for i_t and f_t. 
bool use_dropout_; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 6bff30c501b..812b66c41b1 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -90,7 +90,7 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, const NnetComputation &computation) { // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(config_.compute_config, computation, nnet_, delta_nnet_); // give the inputs to the computer object. @@ -131,7 +131,7 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, bool is_backstitch_step1) { // note: because we give the 1st arg (nnet_) as a pointer to the // constructor of 'computer', it will use that copy of the nnet to - // store stats. This is mainly important for memory-norm. + // store stats. NnetComputer computer(config_.compute_config, computation, nnet_, delta_nnet_); // give the inputs to the computer object. diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index fd2229cace8..afe624f94ca 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -486,6 +486,10 @@ void SetDropoutProportion(BaseFloat dropout_proportion, dynamic_cast(nnet->GetComponent(c)); if (mc != NULL) mc->SetDropoutProportion(dropout_proportion); + GeneralDropoutComponent *gdc = + dynamic_cast(nnet->GetComponent(c)); + if (gdc != NULL) + gdc->SetDropoutProportion(dropout_proportion); } } @@ -1172,12 +1176,17 @@ void ReadEditConfig(std::istream &edit_config_is, Nnet *nnet) { dynamic_cast(nnet->GetComponent(c)); DropoutMaskComponent *mask_component = dynamic_cast(nnet->GetComponent(c)); + GeneralDropoutComponent *general_dropout_component = + dynamic_cast(nnet->GetComponent(c)); if (dropout_component != NULL) { dropout_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; } else if (mask_component != NULL){ mask_component->SetDropoutProportion(proportion); num_dropout_proportions_set++; + } else if (general_dropout_component != NULL){ + general_dropout_component->SetDropoutProportion(proportion); + num_dropout_proportions_set++; } } } @@ -1461,9 +1470,10 @@ class ModelCollapser { /** Tries to produce a component that's equivalent to running the component 'component_index2' with input given by 'component_index1'. This handles - the case where 'component_index1' is of type DropoutComponent, and where - 'component_index2' is of type AffineComponent, - NaturalGradientAffineComponent or TimeHeightConvolutionComponent. + the case where 'component_index1' is of type DropoutComponent or + GeneralDropoutComponent, and where 'component_index2' is of type + AffineComponent, NaturalGradientAffineComponent or + TimeHeightConvolutionComponent. Returns -1 if this code can't produce a combined component (normally because the components have the wrong types). 
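[Editor's aside] The hunk that follows distinguishes the two dropout types by the expectation of their multiplicative mask: GeneralDropoutComponent builds its mask to have expectation 1 (its kept values are scaled up, or in continuous mode the mask is centred on 1), so removing the component needs no compensating scale, whereas a conventional 0/1 mask has expectation 1 - p and does need one. A quick Monte-Carlo check of the three mask means, as a standalone sketch in plain C++ (not Kaldi code):

// Sketch only: mask expectations behind the collapse logic below.
#include <iostream>
#include <random>

int main() {
  const double p = 0.3;          // dropout proportion
  const int n = 1000000;
  std::mt19937 rng(0);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);

  double sum_plain = 0.0, sum_inverted = 0.0, sum_continuous = 0.0;
  for (int i = 0; i < n; i++) {
    double u = uniform(rng);
    // Plain 0/1 mask: zero with probability p.  Mean is (1 - p).
    double plain = (u < p) ? 0.0 : 1.0;
    // Inverted mask: surviving values scaled by 1/(1 - p).  Mean is 1.
    double inverted = plain / (1.0 - p);
    // 'Continuous' mask: 1 + p * Uniform[-2, 2], i.e. a fresh U[0,1] draw
    // scaled by 4p plus (1 - 2p).  Mean is 1.
    double continuous = 1.0 - 2.0 * p + 4.0 * p * uniform(rng);
    sum_plain += plain;
    sum_inverted += inverted;
    sum_continuous += continuous;
  }
  std::cout << "mean(plain 0/1 mask)   ~ " << sum_plain / n       // ~ 0.7
            << "\nmean(inverted mask)   ~ " << sum_inverted / n   // ~ 1.0
            << "\nmean(continuous mask) ~ " << sum_continuous / n // ~ 1.0
            << std::endl;
  return 0;
}

[End of aside.]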
@@ -1473,10 +1483,23 @@ class ModelCollapser { const DropoutComponent *dropout_component = dynamic_cast( nnet_->GetComponent(component_index1)); - if (dropout_component == NULL) + const GeneralDropoutComponent *general_dropout_component = + dynamic_cast( + nnet_->GetComponent(component_index1)); + + if (dropout_component == NULL && general_dropout_component == NULL) return -1; - BaseFloat dropout_proportion = dropout_component->DropoutProportion(); - BaseFloat scale = 1.0 / (1.0 - dropout_proportion); + BaseFloat scale; // the scale we have to apply to correct for removing + // this dropout comonent. + if (dropout_component != NULL) { + BaseFloat dropout_proportion = dropout_component->DropoutProportion(); + scale = 1.0 / (1.0 - dropout_proportion); + } else { + // for GeneralDropoutComponent, it's done in such a way that the expectation + // is always 1. (When it's nonzero, we give it a value 1/(1-dropout_proportion). + // So no scaling is needed. + scale = 1.0; + } // note: if the 2nd component is not of a type that we can scale, the // following function call will return -1, which is OK. return GetScaledComponentIndex(component_index2, diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index efa36e1f64c..4b105e30beb 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -189,7 +189,7 @@ void RecomputeStats(const std::vector &egs, Nnet *nnet); /// This function affects components of child-classes of -/// RandomComponent( currently only DropoutComponent and DropoutMaskComponent). +/// RandomComponent. /// It sets "test mode" on such components (if you call it with test_mode = /// true, otherwise it would set normal mode, but this wouldn't be needed often). /// "test mode" means that having a mask containing (1-dropout_prob) in all @@ -296,7 +296,8 @@ void CollapseModel(const CollapseModelConfig &config, 'remove-orphans'. set-dropout-proportion [name=] proportion= - Sets the dropout rates for any components of type DropoutComponent whose + Sets the dropout rates for any components of type DropoutComponent, + DropoutMaskComponent or GeneralDropoutComponent whose names match the given (e.g. lstm*). defaults to "*". apply-svd name= bottleneck-dim=
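[Editor's aside] One more note on the Read() methods added earlier in this patch: they peek at the next token and only consume the optional test-mode/continuous markers when present, so models written before this change still deserialize with sensible defaults. The sketch below illustrates that idiom with plain C++ iostreams and a made-up ToyDropout struct; it is not Kaldi's I/O layer or its actual token names beyond the ones visible in the diff.

// Sketch only: backward-compatible reading of optional trailing markers.
#include <iostream>
#include <sstream>
#include <string>

struct ToyDropout {
  double dropout_proportion = 0.5;
  bool test_mode = false;
  bool continuous = false;

  void Write(std::ostream &os) const {
    os << "<DropoutProportion> " << dropout_proportion << " ";
    if (test_mode) os << "<TestMode> ";
    if (continuous) os << "<Continuous> ";
    os << "</ToyDropout> ";
  }

  void Read(std::istream &is) {
    std::string tok;
    is >> tok >> dropout_proportion;   // mandatory field
    test_mode = false;                 // defaults used by older models
    continuous = false;
    // Consume optional markers only if they are actually present.
    while (is >> tok && tok != "</ToyDropout>") {
      if (tok == "<TestMode>") test_mode = true;
      else if (tok == "<Continuous>") continuous = true;
    }
  }
};

int main() {
  // An "old" model written without the optional markers still reads fine.
  std::istringstream old_model("<DropoutProportion> 0.2 </ToyDropout> ");
  ToyDropout c;
  c.Read(old_model);
  std::cout << c.dropout_proportion << " " << c.test_mode << " "
            << c.continuous << "\n";   // prints: 0.2 0 0

  // A "new" model round-trips the extra flags.
  ToyDropout d;
  d.continuous = true;
  std::ostringstream os;
  d.Write(os);
  std::istringstream new_model(os.str());
  ToyDropout e;
  e.Read(new_model);
  std::cout << e.continuous << "\n";   // prints: 1
  return 0;
}

[End of aside.]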