diff --git a/egs/mini_librispeech/s5/local/chain/compare_wer.sh b/egs/mini_librispeech/s5/local/chain/compare_wer.sh index cd6be14ed88..8ee5db2326a 100755 --- a/egs/mini_librispeech/s5/local/chain/compare_wer.sh +++ b/egs/mini_librispeech/s5/local/chain/compare_wer.sh @@ -129,3 +129,9 @@ for x in $*; do printf "% 10s" $prob done echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..9cc6d93022a --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,311 @@ +#!/bin/bash + + +# 1f is as 1e but a smaller model with various tuning changes, the most +# important of which is the 'bottleneck-dim' option for the last layer; +# also dimensions are reduced and we've removed the 'target-rms=0.5' options +# on the prefinal layers. +# +# local/chain/compare_wer.sh --online exp/chain/tdnn1{e,f}_sp 2>/dev/null +# local/chain/compare_wer.sh --online exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f_sp +#WER dev_clean_2 (tgsmall) 14.11 13.91 +# [online:] 14.07 13.96 +#WER dev_clean_2 (tglarge) 10.15 9.95 +# [online:] 10.16 10.13 +# Final train prob -0.0503 -0.0508 +# Final valid prob -0.0887 -0.0917 +# Final train prob (xent) -1.4257 -1.3509 +# Final valid prob (xent) -1.6799 -1.5883 +# Num-params 7508490 4205322 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.057->-0.057 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.46,-1.43/-1.94,-1.72,-1.68) logprob:train/valid[10,16,final]=(-0.067,-0.055,-0.050/-0.105,-0.095,-0.089) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 2) xent:train/valid[10,16,final]=(-1.60,-1.39,-1.35/-1.81,-1.64,-1.59) logprob:train/valid[10,16,final]=(-0.068,-0.056,-0.051/-0.104,-0.097,-0.092) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1f # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=384 + relu-batchnorm-layer name=tdnn2 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=384 + relu-batchnorm-layer name=tdnn4 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=384 + relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=384 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=384 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ @@ -243,7 +244,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh new file mode 100755 index 00000000000..cf4855db611 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7n.sh @@ -0,0 +1,274 @@ +#!/bin/bash + + +# 7n is a kind of factorized TDNN, with skip connections. We have to write +# a proper description for this. Note: I'm not happy with how + +# The following compares this with our old tdnn_lstm system before kaldi 5.4 +# (from run_tdnn_lstm_1m.sh), and with our old TDNN system. It's over 1.5% +# absolute better than our old TDNN system, and even a bit better than our old +# TDNN+LSTM with dropout. 
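+#
+# Concretely, in the config below each spliced hidden layer is factored into
+# a 256-dim linear bottleneck (the linear-component layers, kept close to
+# semi-orthogonal by orthonormal-constraint=1.0) followed by a wide 1280-dim
+# relu-batchnorm layer; the skip connections are the extra tdnn*l terms in
+# the Append(...) expressions.  As a back-of-the-envelope check (our
+# arithmetic, ignoring biases and batchnorm), one unfactored spliced layer
+# would cost 2560*1280 = 3276800 weights, while the factored pair costs
+# 2560*256 + 512*1280 = 1310720, e.g.:
+#   echo $((2560*256 + 512*1280))   # -> 1310720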
+# +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn_7m_sp tdnn7n_sp +# System tdnn_lstm_1m_ld5_sp tdnn_7m_sp tdnn7n_sp +# WER on train_dev(tg) 12.33 13.70 12.18 +# WER on train_dev(fg) 11.42 12.67 11.12 +# WER on eval2000(tg) 15.2 16.6 14.9 +# WER on eval2000(fg) 13.8 15.1 13.5 +# WER on rt03(tg) 18.6 20.9 18.4 +# WER on rt03(fg) 16.3 18.3 16.2 +# Final train prob -0.082 -0.085 -0.077 +# Final valid prob -0.099 -0.103 -0.093 +# Final train prob (xent) -0.959 -1.230 -0.994 +# Final valid prob (xent) -1.0305 -1.2704 -1.0194 +# Num-parameters 39558436 16292693 20111396 + + + +# steps/info/chain_dir_info.pl exp/chain/tdnn7m23t_sp +# exp/chain/tdnn7m23t_sp: num-iters=394 nj=3..16 num-params=20.1M dim=40+100->6034 combine=-0.083->-0.081 (over 20) xent:train/valid[261,393,final]=(-1.05,-0.991,-0.994/-1.09,-1.02,-1.02) logprob:train/valid[261,393,final]=(-0.085,-0.077,-0.077/-0.100,-0.095,-0.093) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7n +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts 
input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 1d566290163..b50692616c4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -6,11 +6,14 @@ # After comparing different combinations of dropout(with or without) and decay-time # option(20, 40 or without), we found this setup is best. -#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online +#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online #WER on train_dev(tg) 12.41 12.37 12.21 #WER on train_dev(fg) 11.59 11.46 11.41 #WER on eval2000(tg) 14.8 14.8 14.9 #WER on eval2000(fg) 13.5 13.5 13.6 +# WER on rt03(tg) 18.6 +# WER on rt03(fg) 16.3 + #Final train prob -0.069 -0.081 #Final valid prob -0.095 -0.100 #Final train prob (xent) -0.913 -0.950 @@ -30,6 +33,7 @@ dir=exp/chain/tdnn_lstm_1m # Note: _sp will get added to this if $speed_perturb decode_iter= decode_dir_affix= decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi # training options leftmost_questions_truncate=-1 @@ -227,7 +231,7 @@ if [ $stage -le 15 ]; then if [ ! 
-z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 50 --cmd "$decode_cmd" $iter_opts \ @@ -257,7 +261,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh new file mode 100755 index 00000000000..9cb182b2915 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -0,0 +1,284 @@ +#!/bin/bash + + +# 1n is as 1m but with significant changes, replacing TDNN layers with a +# structure like run_tdnn_7n.sh. Seems better! But the improvement +# versus the best TDNN system (see run_tdnn_7n.sh) is so small that it's +# not really worth it when you consider how much slower it is. + +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn_lstm_1m_ld5_sp_online tdnn_lstm1n_sp tdnn_lstm1n_sp_online +# System tdnn_lstm_1m_ld5_sp tdnn_lstm_1m_ld5_sp_online tdnn_lstm1n_sp tdnn_lstm1n_sp_online +# WER on train_dev(tg) 12.33 12.21 12.38 12.49 +# WER on train_dev(fg) 11.42 11.41 11.48 11.59 +# WER on eval2000(tg) 15.2 15.1 15.0 14.9 +# WER on eval2000(fg) 13.8 13.8 13.5 13.5 +# WER on rt03(tg) 18.6 18.4 18.0 18.0 +# WER on rt03(fg) 16.3 16.1 15.8 15.8 +# Final train prob -0.082 0.000 -0.084 0.000 +# Final valid prob -0.099 0.000 -0.104 0.000 +# Final train prob (xent) -0.959 0.000 -1.154 0.000 +# Final valid prob (xent) -1.0305 0.0000 -1.2190 0.0000 +# Num-parameters 39558436 0 27773348 0 +# + + +# exp/chain/tdnn_lstm1n_sp: num-iters=394 nj=3..16 num-params=27.8M dim=40+100->6034 combine=-0.081->-0.080 (over 5) xent:train/valid[261,393,final]=(-1.59,-1.14,-1.15/-1.64,-1.22,-1.22) logprob:train/valid[261,393,final]=(-0.105,-0.086,-0.084/-0.123,-0.107,-0.104) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1n +decode_iter= +decode_dir_affix= +decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +# training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=true +common_egs_dir= + +test_online_decoding=true # if true, it will run the last decoding stage. +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
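+  # (As a sanity check after this stage, the actual number of leaves can be
+  # read back with e.g. `tree-info $treedir/tree | grep num-pdfs`; it comes
+  # out somewhat below the 7000 requested below -- 6034 in the run shown in
+  # the header above.)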
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + lstm_opts="l2-regularize=0.0005 decay-time=40" + output_opts="l2-regularize=0.0005 output-delay=$label_delay max-change=1.5 dim=$num_targets" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
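+      # (${dir}_online, created by prepare_online_decoding.sh above, bundles
+      # the model with the i-vector extractor and feature config, which is
+      # why no --online-ivector-dir option is needed here, unlike in the
+      # offline decoding stage above.)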
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index 00b2d29cc88..88dde1ff0e2 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -102,5 +102,10 @@ for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done +echo +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done echo diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/wsj/s5/local/chain/run_tdnn.sh +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh index 8e647598556..a4fa11e0908 120000 --- a/egs/wsj/s5/local/chain/run_tdnn_lstm.sh +++ b/egs/wsj/s5/local/chain/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1a.sh \ No newline at end of file +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..be8d39de80b --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,342 @@ +#!/bin/bash + +# 1f is as 1e but a re-tuned model with fewer parameters and a bottleneck at the +# end, and no chain l2-regularize +#[note: was 1e12e.] + +# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1e12e_sp +# System tdnn1e10_sp tdnn1e12e_sp +#WER dev93 (tgpr) 7.29 7.20 +#WER dev93 (tg) 7.08 6.81 +#WER dev93 (big-dict,tgpr) 5.15 5.04 +#WER dev93 (big-dict,fg) 4.52 4.42 +#WER eval92 (tgpr) 5.12 4.80 +#WER eval92 (tg) 4.91 4.54 +#WER eval92 (big-dict,tgpr) 2.94 2.76 +#WER eval92 (big-dict,fg) 2.57 2.30 +# Final train prob -0.0545 -0.0455 +# Final valid prob -0.0650 -0.0599 +# Final train prob (xent) -0.9696 -0.9060 +# Final valid prob (xent) -0.9917 -0.9448 +# Num-params 8067660 6071244 + + +# exp/chain/tdnn1e_sp: num-iters=72 nj=2..8 num-params=8.1M dim=40+100->2854 combine=-0.064->-0.063 (over 3) xent:train/valid[47,71,final]=(-1.07,-0.973,-0.970/-1.08,-0.992,-0.992) logprob:train/valid[47,71,final]=(-0.064,-0.056,-0.054/-0.072,-0.066,-0.065) +# exp/chain/tdnn1f_sp: num-iters=72 nj=2..8 num-params=6.1M dim=40+100->2854 combine=-0.061->-0.061 (over 2) xent:train/valid[47,71,final]=(-1.04,-0.911,-0.910/-1.06,-0.953,-0.952) logprob:train/valid[47,71,final]=(-0.063,-0.052,-0.051/-0.071,-0.064,-0.064) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
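+# All of the option defaults below can be overridden on the command line via
+# utils/parse_options.sh (sourced further down); e.g., a hypothetical
+# invocation to rerun only from the network-training stage:
+#   local/chain/tuning/run_tdnn_1f.sh --stage 16 --train-stage -10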
+stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1f #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=320" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=448 + relu-batchnorm-layer name=tdnn2 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=448 + relu-batchnorm-layer name=tdnn4 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=448 + relu-batchnorm-layer name=tdnn6 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=448 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=448 + output-layer name=output $output_opts include-log-softmax=false dim=$num_targets + + # adding the layers for xent 
branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent $opts input=tdnn8 dim=448 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
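+  # For example (hypothetical directory name), you could build a graph with a
+  # different LM like this:
+  #   utils/lang/check_phones_compatible.sh data/lang_test_other/phones.txt $lang/phones.txt
+  #   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_other $tree_dir $tree_dir/graph_other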
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 6) +# 1b19 is a rerun of 1b18d3 (a fairly small LSTM+TDNN setup). +# +# +# 1b18d3 is as 1b18d2 but reducing lstm bottleneck dim from 304 to 256. +# [1b18d2 is just a rerun of 1b18d as I merged various code changes and +# I want to make sure nothing bad happened.] +# +# Results below show it's probably slightly better than the average of 18d and 18d2 +# (which are supposed to be the same experiment)... +# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18d_sp exp/chain/tdnn_lstm1b18d2_sp exp/chain/tdnn_lstm1b18d3_sp +# System tdnn_lstm1b18d_sp tdnn_lstm1b18d2_sp tdnn_lstm1b18d3_sp +#WER dev93 (tgpr) 7.78 7.46 7.46 +#WER dev93 (tg) 7.29 7.30 7.04 +#WER dev93 (big-dict,tgpr) 5.56 5.51 5.55 +#WER dev93 (big-dict,fg) 5.32 5.08 5.05 +#WER eval92 (tgpr) 5.33 5.40 5.39 +#WER eval92 (tg) 5.05 5.03 4.96 +#WER eval92 (big-dict,tgpr) 3.42 3.26 3.35 +#WER eval92 (big-dict,fg) 2.91 2.64 2.82 +# Final train prob -0.0529 -0.0536 -0.0543 +# Final valid prob -0.0633 -0.0630 -0.0636 +# Final train prob (xent) -0.8327 -0.8330 -0.8415 +# Final valid prob (xent) -0.8693 -0.8672 -0.8695 +# Num-params 4922060 4922060 4805324 + +# +# 1b18d is as 1b18c, but adding 'self-scale=2.0' to scale up the m_trunc when it is given +# as input to the affine projections (I found previously this was helpful). +# .. Interesting: objf improves but WER is not better. +# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18c_sp exp/chain/tdnn_lstm1b18d_sp +# System tdnn_lstm1b18c_sp tdnn_lstm1b18d_sp +#WER dev93 (tgpr) 7.77 7.78 +#WER dev93 (tg) 7.40 7.29 +#WER dev93 (big-dict,tgpr) 5.39 5.56 +#WER dev93 (big-dict,fg) 5.25 5.32 +#WER eval92 (tgpr) 5.48 5.33 +#WER eval92 (tg) 4.98 5.05 +#WER eval92 (big-dict,tgpr) 3.07 3.42 +#WER eval92 (big-dict,fg) 2.69 2.91 +# Final train prob -0.0546 -0.0529 +# Final valid prob -0.0641 -0.0633 +# Final train prob (xent) -0.8679 -0.8327 +# Final valid prob (xent) -0.8954 -0.8693 +# Num-params 4922060 4922060 + +# 1b18c is as 1b18b, but fixing a bug in the script whereby c instead of m had been used +# as input to the affine projections. + +# 1b18b is as 1b18, but doubling l2 regularization on the output +# and lstm layers, parts of them were training too slowly. +# +# 1b18 is as 1b17, but via script change, not using memory-norm (actually +# this is the same as 1b17d). +# I don't see any WER change, but objf is worse. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b17_sp exp/chain/tdnn_lstm1b17d_sp exp/chain/tdnn_lstm1b18_sp +# System tdnn_lstm1b17_sp tdnn_lstm1b17d_sp tdnn_lstm1b18_sp +#WER dev93 (tgpr) 7.49 7.44 7.48 +#WER dev93 (tg) 7.18 7.13 7.19 +#WER dev93 (big-dict,tgpr) 5.50 5.34 5.48 +#WER dev93 (big-dict,fg) 5.11 5.15 5.04 +#WER eval92 (tgpr) 5.26 5.32 5.32 +#WER eval92 (tg) 5.00 4.94 5.03 +#WER eval92 (big-dict,tgpr) 3.24 3.28 3.26 +#WER eval92 (big-dict,fg) 2.82 2.80 2.84 +# Final train prob -0.0489 -0.0486 -0.0496 +# Final valid prob -0.0583 -0.0599 -0.0612 +# Final train prob (xent) -0.7550 -0.7809 -0.7749 +# Final valid prob (xent) -0.7988 -0.8121 -0.8131 +# Num-params 4922060 4922060 4922060 + +# 1b17 is as 1b13m, it's just a rerun after some code changes (adding +# diagonal natural gradient stuff) which should make no difference. +# Still seems to be working. + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp exp/chain/tdnn_lstm1b17_sp +# System tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp tdnn_lstm1b17_sp +#WER dev93 (tgpr) 7.86 7.43 7.49 +#WER dev93 (tg) 7.40 7.00 7.18 +#WER dev93 (big-dict,tgpr) 5.65 5.21 5.50 +#WER dev93 (big-dict,fg) 5.11 4.76 5.11 +#WER eval92 (tgpr) 5.64 5.39 5.26 +#WER eval92 (tg) 5.17 5.00 5.00 +#WER eval92 (big-dict,tgpr) 3.21 3.30 3.24 +#WER eval92 (big-dict,fg) 2.84 2.62 2.82 +# Final train prob -0.0469 -0.0516 -0.0489 +# Final valid prob -0.0601 -0.0607 -0.0583 +# Final train prob (xent) -0.7424 -0.7593 -0.7550 +# Final valid prob (xent) -0.7920 -0.7982 -0.7988 +# Num-params 5456076 4922060 4922060 + +# 1b13m is as 1b13l, but reverting the LSTM script "fix" (which actually +# made things worse), so the baseline is 1b13{c,d} (and the change versus +# c,d is to add bottleneck-dim=256). +# +# It's helpful: +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp +#WER dev93 (tgpr) 7.68 7.86 7.43 +#WER dev93 (tg) 7.34 7.40 7.00 +#WER dev93 (big-dict,tgpr) 5.42 5.65 5.21 +#WER dev93 (big-dict,fg) 5.05 5.11 4.76 +#WER eval92 (tgpr) 5.48 5.64 5.39 +#WER eval92 (tg) 5.26 5.17 5.00 +#WER eval92 (big-dict,tgpr) 3.23 3.21 3.30 +#WER eval92 (big-dict,fg) 2.82 2.84 2.62 +# Final train prob -0.0490 -0.0469 -0.0516 +# Final valid prob -0.0597 -0.0601 -0.0607 +# Final train prob (xent) -0.7549 -0.7424 -0.7593 +# Final valid prob (xent) -0.7910 -0.7920 -0.7982 +# Num-params 5456076 5456076 4922060 +# +# +# 1b13l is as 1b13k, but adding bottleneck-dim=256 to the output layers. +# Definitely helpful: + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13k_sp exp/chain/tdnn_lstm1b13l_sp +# System tdnn_lstm1b13k_sp tdnn_lstm1b13l_sp +#WER dev93 (tgpr) 7.94 7.46 +#WER dev93 (tg) 7.68 7.09 +#WER dev93 (big-dict,tgpr) 5.91 5.39 +#WER dev93 (big-dict,fg) 5.56 4.94 +#WER eval92 (tgpr) 5.65 5.44 +#WER eval92 (tg) 5.32 5.09 +#WER eval92 (big-dict,tgpr) 3.49 3.15 +#WER eval92 (big-dict,fg) 3.07 2.94 +# Final train prob -0.0491 -0.0513 +# Final valid prob -0.0600 -0.0599 +# Final train prob (xent) -0.7395 -0.7490 +# Final valid prob (xent) -0.7762 -0.7860 +# Num-params 5456076 4922060 + +# 1b13k is as 1b13d, but after a script fix: previously we were using the 'c' +# for the full-matrix part of the recurrence instead of the 'm'. + +# 1b13d is as 1b13c, but a rerun after fixing a code bug whereby the natural gradient +# for the LinearComponent was turned off by default when initializing from config. 
+# **Update: turns out there was no difference here, the code had been ignoring +# that config variable.** +# +# It seems to optimize better, although the WER change is unclear. However, it's +# interesting that the average objf in the individual training jobs (train.*.log) is not better- +# but in compute_prob_train.*.log it is. It seems that the natural gradient interacts +# well with model averaging, which is what we found previously in the NG paper. + + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp +#WER dev93 (tgpr) 7.68 7.86 +#WER dev93 (tg) 7.34 7.40 +#WER dev93 (big-dict,tgpr) 5.42 5.65 +#WER dev93 (big-dict,fg) 5.05 5.11 +#WER eval92 (tgpr) 5.48 5.64 +#WER eval92 (tg) 5.26 5.17 +#WER eval92 (big-dict,tgpr) 3.23 3.21 +#WER eval92 (big-dict,fg) 2.82 2.84 +# Final train prob -0.0490 -0.0469 +# Final valid prob -0.0597 -0.0601 +# Final train prob (xent) -0.7549 -0.7424 +# Final valid prob (xent) -0.7910 -0.7920 +# Num-params 5456076 5456076 +# +# +# 1b13c is as 1b13b, but after script change in which the lstmb layer was +# rewritten, adding memnorm and removing the scale of 4.0, along with some +# more minor changes and streamlining/removing options. +# +# 1b13b is as 1b13, but a rerun after merging with the memnorm-and-combine +# branch. Slight difference in num-params is because of 300 vs 304. + +# 1b13 is as 1b10 but reducing the bottleneck dim to 304 +# (because I want to get in the habit of using multiples of 8). +# WER seems improved. +# +# + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b10_sp exp/chain/tdnn_lstm1b13_sp +# System tdnn_lstm1b10_sp tdnn_lstm1b13_sp +#WER dev93 (tgpr) 7.87 7.63 +#WER dev93 (tg) 7.48 7.46 +#WER dev93 (big-dict,tgpr) 5.55 5.56 +#WER dev93 (big-dict,fg) 5.25 5.09 +#WER eval92 (tgpr) 5.44 5.48 +#WER eval92 (tg) 5.05 5.12 +#WER eval92 (big-dict,tgpr) 3.24 3.17 +#WER eval92 (big-dict,fg) 2.73 2.60 +# Final train prob -0.0463 -0.0470 +# Final valid prob -0.0561 -0.0565 +# Final train prob (xent) -0.7362 -0.7588 +# Final valid prob (xent) -0.7730 -0.7831 +# Num-params 5650636 5446348 + +# 1b10 is as 1b9 but reducing the cell and bottleneck dimension of LSTM layer from 512 to 384. +# Seems helpful on average-- nice! + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b9_sp exp/chain/tdnn_lstm1b10_sp +# System tdnn_lstm1b9_sp tdnn_lstm1b10_sp +#WER dev93 (tgpr) 7.74 7.87 +#WER dev93 (tg) 7.46 7.48 +#WER dev93 (big-dict,tgpr) 5.67 5.55 +#WER dev93 (big-dict,fg) 5.31 5.25 +#WER eval92 (tgpr) 5.60 5.44 +#WER eval92 (tg) 5.42 5.05 +#WER eval92 (big-dict,tgpr) 3.47 3.24 +#WER eval92 (big-dict,fg) 3.07 2.73 +# Final train prob -0.0413 -0.0463 +# Final valid prob -0.0543 -0.0561 +# Final train prob (xent) -0.6786 -0.7362 +# Final valid prob (xent) -0.7249 -0.7730 +# Num-params 7021644 5650636 + +# 1b9 is as 1b8 but adding batchnorm after the LSTM layer.. this is +# to correct an oversight. +# 1b8 is as 1b7 but with quite a few layers removed. WER effect is unclear. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b7_sp exp/chain/tdnn_lstm1b8_sp +# System tdnn_lstm1b7_sp tdnn_lstm1b8_sp +#WER dev93 (tgpr) 7.31 7.60 +#WER dev93 (tg) 7.10 7.25 +#WER dev93 (big-dict,tgpr) 5.26 5.26 +#WER dev93 (big-dict,fg) 4.64 4.93 +#WER eval92 (tgpr) 5.48 5.32 +#WER eval92 (tg) 5.00 5.07 +#WER eval92 (big-dict,tgpr) 3.35 3.31 +#WER eval92 (big-dict,fg) 2.99 2.84 +# Final train prob -0.0483 -0.0533 +# Final valid prob -0.0573 -0.0627 +# Final train prob (xent) -0.7207 -0.8234 +# Final valid prob (xent) -0.7467 -0.8466 +# Num-params 11752524 7021644 + +# 1b7 is as 1b6 but adding self-stabilize=true and normalize-type=none; +# and after a script-level change that scale 'c' by 4 before giving it +# to the W_all_a matrix (to see where all this came from, look at run_tdnn_lstm_1b16.sh +# in the mini_librispeech setup, although by the time you see this, that may no longer exist). +# +# 1b6 is as 1b3 but replacing renorm with batchnorm for the TDNN layers, +# and adding batchnorm to the LSTMB layers. Effect on WER unclear but generally +# it's better. + + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1{a2,a3,b3,b6}_sp +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a2_sp exp/chain/tdnn_lstm1a3_sp exp/chain/tdnn_lstm1b3_sp exp/chain/tdnn_lstm1b6_sp +# System tdnn_lstm1a2_sp tdnn_lstm1a3_sp tdnn_lstm1b3_sp tdnn_lstm1b6_sp +#WER dev93 (tgpr) 7.47 7.65 7.26 7.32 +#WER dev93 (tg) 7.29 7.24 6.96 6.98 +#WER dev93 (big-dict,tgpr) 5.44 5.60 5.43 5.22 +#WER dev93 (big-dict,fg) 4.98 5.04 4.97 4.86 +#WER eval92 (tgpr) 5.78 5.21 5.30 5.14 +#WER eval92 (tg) 5.44 5.00 4.87 4.82 +#WER eval92 (big-dict,tgpr) 3.35 3.23 3.42 3.24 +#WER eval92 (big-dict,fg) 2.99 2.96 3.03 2.82 +# Final train prob -0.0447 -0.0410 -0.0484 -0.0503 +# Final valid prob -0.0566 -0.0518 -0.0594 -0.0599 +# Final train prob (xent) -0.6859 -0.6676 -0.7528 -0.7415 +# Final valid prob (xent) -0.7378 -0.7230 -0.8078 -0.7804 +# Num-params 9106252 9106252 11747916 11746380 + +# 1b3 is as 1a2 but with the same change as in a->b, replacing lstmp with lstmb +# 1a2 is as 1a but adding l2 regularization. + +# this is a TDNN+LSTM chain system. +# It was modified from local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh with +# reference to ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh. +# Note: we're using the same hidden-layer sizes as +# ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh despite the +# fact that we'd normally choose a smaller model for a setup with +# less data, because the Tedlium model was probably on the small side. +# Note: we normally use more parameters for LSTM-containing than TDNN-only +# systems. + +# steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1a_sp +# exp/chain/tdnn_lstm1a_sp: num-iters=120 nj=2..10 num-params=9.1M dim=40+100->2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. +# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. 
+ +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=8 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=256" + lstm_opts="l2-regularize=0.005 self-scale=2.0" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=5 input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=448 + relu-batchnorm-layer name=tdnn2 $tdnn_opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $tdnn_opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 $tdnn_opts dim=448 input=Append(-3,0,3) + lstmb-layer name=lstm3 $lstm_opts cell-dim=384 bottleneck-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 $output_opts output-delay=$label_delay include-log-softmax=false dim=$num_targets + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
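As an illustration of the point above, the graph could equally be built with a different LM. A sketch, assuming the full-trigram lang directory data/lang_test_tg from the standard WSJ setup (the graph name graph_tg is hypothetical):

    utils/lang/check_phones_compatible.sh data/lang_test_tg/phones.txt $lang/phones.txt
    utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_tg $tree_dir $tree_dir/graph_tg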
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l %.3f", $1, $2); } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { close(F); - return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); + return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); } } } @@ -204,6 +204,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_train_logprob{$iter} = $1; $iter_to_train_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_train_logprob{$iter} = $1; + $iter_to_train_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_train_xent{$iter} = $1; } @@ -213,6 +216,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_valid_logprob{$iter} = $1; $iter_to_valid_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_valid_logprob{$iter} = $1; + $iter_to_valid_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_valid_xent{$iter} = $1; } diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index d5f2575d582..905edc1a78b 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -388,8 +388,8 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): " key {k} in both {tl} and {vl}".format( k=key, tl=train_prob_files, vl=valid_prob_files)) iters.sort() - return map(lambda x: (int(x), float(train_objf[x]), - float(valid_objf[x])), iters) + return list(map(lambda x: (int(x), float(train_objf[x]), + float(valid_objf[x])), iters)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5b640510ea1..3df2720b2c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -128,7 +128,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, + frame_subsampling_factor, run_opts, train_opts, backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this method trains new models @@ -184,7 +184,7 @@ def train_new_models(dir, iter, srand, num_jobs, --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ 
--backstitch-training-interval={backstitch_training_interval} \ - --l2-regularize-factor={l2_regularize_factor} \ + --l2-regularize-factor={l2_regularize_factor} {train_opts} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs \ @@ -201,6 +201,7 @@ def train_new_models(dir, iter, srand, num_jobs, deriv_time_opts=" ".join(deriv_time_opts), app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, + train_opts=train_opts, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, cache_io_opts=cache_io_opts, parallel_train_opts=run_opts.parallel_train_opts, @@ -233,7 +234,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, - run_opts, dropout_edit_string="", + run_opts, dropout_edit_string="", train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -306,7 +307,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, - run_opts=run_opts, + run_opts=run_opts, train_opts=train_opts, # linearly increase backstitch_training_scale during the # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * @@ -387,8 +388,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ @@ -480,14 +481,34 @@ def compute_progress(dir, iter, run_opts): common_lib.background_command( """{command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ - nnet3-show-progress --use-gpu=no \ - "nnet3-am-copy --raw=true {prev_model} - |" \ - "nnet3-am-copy --raw=true {model} - |" + nnet3-show-progress --use-gpu=no {prev_model} {model} """.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. + common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. 
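A sketch of how these extra diagnostics might be inspected after a run; $dir is the experiment directory, and the grep pattern is illustrative only, since the exact nnet3-info --verbose=2 output format is not reproduced in this patch:

    # written every 10 iterations: full_info.10.log, full_info.20.log, ...
    ls $dir/log/full_info.*.log $dir/log/full_progress.*.log
    # skim the singular-value stats of the parameter matrices in one model
    grep -i singular $dir/log/full_info.10.log | head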
+ common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 2b4fdd92cec..443834fc161 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -531,7 +531,7 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) - scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + scaled_counts = list(map(lambda x: x * float(num_pdfs) / sum(scales), scales)) return scaled_counts @@ -903,6 +903,11 @@ def __init__(self, lstm*=0,0.2,0'. More general should precede less general patterns, as they are applied sequentially.""") + self.parser.add_argument("--trainer.add-option", type=str, + dest='train_opts', action='append', default=[], + help="""You can use this to add arbitrary options that + will be passed through to the core training code (nnet3-train + or nnet3-chain-train)""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', default=0.0, help="""scale of parameters changes diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 8bdcd160409..9dd12e63f52 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -9,6 +9,8 @@ network without transition model) with frame-level objectives. 
""" +from __future__ import print_statement +from __future__ import division import glob import logging import math @@ -31,7 +33,7 @@ def train_new_models(dir, iter, srand, num_jobs, image_augmentation_opts, run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - use_multitask_egs=False, + use_multitask_egs=False, train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this model does one iteration of training with 'num_jobs' jobs, and writes files like @@ -91,7 +93,7 @@ def train_new_models(dir, iter, srand, num_jobs, archive_index = (k % num_archives) + 1 if not chunk_level_training: - frame = (k / num_archives + archive_index) % frames_per_eg + frame = (k // num_archives + archive_index) % frames_per_eg cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) @@ -142,7 +144,7 @@ def train_new_models(dir, iter, srand, num_jobs, --backstitch-training-scale={backstitch_training_scale} \ --l2-regularize-factor={l2_regularize_factor} \ --backstitch-training-interval={backstitch_training_interval} \ - --srand={srand} \ + --srand={srand} {train_opts} \ {deriv_time_opts} "{raw_model}" "{egs_rspecifier}" \ {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, @@ -157,6 +159,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize_factor=1.0/num_jobs, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval, + train_opts=train_opts, deriv_time_opts=" ".join(deriv_time_opts), raw_model=raw_model_string, egs_rspecifier=egs_rspecifier), @@ -175,9 +178,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, run_opts, image_augmentation_opts=None, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - shrinkage_value=1.0, dropout_edit_string="", - get_raw_nnet_from_am=True, - use_multitask_egs=False, + shrinkage_value=1.0, dropout_edit_string="", train_opts="", + get_raw_nnet_from_am=True, use_multitask_egs=False, backstitch_training_scale=0.0, backstitch_training_interval=1, compute_per_dim_accuracy=False): """ Called from steps/nnet3/train_*.py scripts for one iteration of neural @@ -277,6 +279,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, max_deriv_time_relative=max_deriv_time_relative, image_augmentation_opts=image_augmentation_opts, use_multitask_egs=use_multitask_egs, + train_opts=train_opts, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval) @@ -344,8 +347,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ @@ -447,6 +450,29 @@ def compute_progress(dir, iter, egs_dir, ''.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. 
+ common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. + common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + + def combine_models(dir, num_iters, models_to_combine, egs_dir, minibatch_size_str, @@ -553,7 +579,7 @@ def get_realign_iters(realign_times, num_iters, + realign_time * math.pow(num_jobs_final, 2)) realign_iter = realign_iter - num_jobs_initial - realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter // (num_jobs_final - num_jobs_initial) realign_iter = realign_iter * num_iters realign_iters.append(int(realign_iter)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 05ae5bcdc18..a3dfa89cf0e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -402,8 +402,7 @@ def get_full_config(self): # the input layers need to be printed in 'init.config' (which # initializes the neural network prior to the LDA), in 'ref.config', # which is a version of the config file used for getting left and right - # context (it doesn't read anything for the LDA-like transform and/or - # presoftmax-prior-scale components) + # context (it doesn't read anything for the LDA-like transform). # In 'full.config' we write everything, this is just for reference, # and also for cases where we don't use the LDA-like transform. ans = [] @@ -430,6 +429,9 @@ class XconfigOutputLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' : Descriptor giving the input of the layer. dim=None : Output dimension of layer, will normally equal the number of pdfs. + bottleneck-dim=None : Bottleneck dimension of layer: if supplied, instead of + an affine component we'll have a linear then affine, so a linear + bottleneck, with the linear part constrained to be orthonormal. include-log-softmax=true : setting it to false will omit the log-softmax component- useful for chain models. objective-type=linear : the only other choice currently is @@ -441,16 +443,6 @@ class XconfigOutputLayer(XconfigLayerBase): learning-rate-factor=(0.5/xent_regularize), normally learning-rate-factor=5.0 since xent_regularize is normally 0.1. - presoftmax-scale-file=None : If set, a filename for a vector that - will be used to scale the output of the affine component before the - log-softmax (if include-log-softmax=true), or before the output - (if not). This is helpful to avoid instability in training due to - some classes having much more data than others. The way we normally - create this vector is to take the priors of the classes to the - power -0.25 and rescale them so the average is 1.0. This factor - -0.25 is referred to as presoftmax_prior_scale_power in scripts. In - the scripts this would normally be set to - config_dir/presoftmax_prior_scale.vec max-change=1.5 : Can be used to change the max-change parameter in the affine component; this affects how much the matrix can change on each iteration. 
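The bottleneck-dim option documented above is what the run scripts in this patch set via $output_opts; a minimal xconfig line would look something like this (dimensions hypothetical):

    output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 max-change=1.5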
@@ -462,6 +454,9 @@ class XconfigOutputLayer(XconfigLayerBase):
    ng-affine-options='' :  Can be used to supply non-default options to the affine
        layer (intended for the natural gradient but can be an arbitrary string
        to be added to the config line. e.g. 'update-period=2'.).
+   ng-linear-options='' : Options, like ng-affine-options, that are passed to
+       the LinearComponent, only in bottleneck layers (i.e. if bottleneck-dim
+       is supplied).
    """

    def __init__(self, first_token, key_to_value, prev_names=None):
@@ -475,13 +470,15 @@ def set_default_configs(self):
        # the most recent layer.
        self.config = {'input': '[-1]',
                       'dim': -1,
+                      'bottleneck-dim': -1,
+                      'orthonormal-constraint': 1.0,
+                      # orthonormal-constraint only matters if bottleneck-dim is set.
                       'include-log-softmax': True,
                       # this would be false for chain models
                       'objective-type': 'linear',
                       # see Nnet::ProcessOutputNodeConfigLine in
                       # nnet-nnet.cc for other options
                       'learning-rate-factor': 1.0,
-                      'presoftmax-scale-file': '',
                       # used in DNN (not RNN) training when using
                       # frame-level objfns,
                       'max-change': 1.5,
@@ -489,7 +486,8 @@ def set_default_configs(self):
                       'bias-stddev': 0.0,
                       'l2-regularize': 0.0,
                       'output-delay': 0,
-                      'ng-affine-options': ''
+                      'ng-affine-options': '',
+                      'ng-linear-options': ''   # only affects bottleneck output layers.
                       }

    def check_configs(self):
@@ -533,8 +531,21 @@ def output_dim(self, auxiliary_output=None):
                " layers")

    def get_full_config(self):
        ans = []
+       config_lines = self._generate_config()
+
+       for line in config_lines:
+           for config_name in ['ref', 'final']:
+               # we do not support user specified matrices in LSTM initialization
+               # so 'ref' and 'final' configs are the same.
+               ans.append((config_name, line))
+       return ans
+
+
+   def _generate_config(self):
+
+       configs = []
        # note: each value of self.descriptors is (descriptor, dim,
        # normalized-string, output-string).
@@ -543,10 +553,10 @@ def get_full_config(self):
        descriptor_final_string = self.descriptors['input']['final-string']
        input_dim = self.descriptors['input']['dim']
        output_dim = self.config['dim']
+       bottleneck_dim = self.config['bottleneck-dim']
        objective_type = self.config['objective-type']
        learning_rate_factor = self.config['learning-rate-factor']
        include_log_softmax = self.config['include-log-softmax']
-       presoftmax_scale_file = self.config['presoftmax-scale-file']
        param_stddev = self.config['param-stddev']
        bias_stddev = self.config['bias-stddev']
        l2_regularize = self.config['l2-regularize']
@@ -558,64 +568,72 @@ def get_full_config(self):
        l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize)
                                if l2_regularize != 0.0 else '')

-       # note: ref.config is used only for getting the left-context and
-       # right-context of the network;
-       # final.config is where we put the actual network definition.
-       for config_name in ['ref', 'final']:
-           # First the affine node.
- line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' param-stddev={3}' - ' bias-stddev={4}' - ' max-change={5} {6} {7} {8}' - ''.format(self.name, input_dim, output_dim, - param_stddev, bias_stddev, max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) - ans.append((config_name, line)) - - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, descriptor_final_string)) - ans.append((config_name, line)) - cur_node = '{0}.affine'.format(self.name) - - if presoftmax_scale_file is not '' and config_name == 'final': - # don't use the presoftmax-scale in 'ref.config' since that - # file won't exist at the time we evaluate it. - # (ref.config is used to find the left/right context). - line = ('component name={0}.fixed-scale' - ' type=FixedScaleComponent scales={1}' - ''.format(self.name, presoftmax_scale_file)) - ans.append((config_name, line)) - - line = ('component-node name={0}.fixed-scale' - ' component={0}.fixed-scale input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.fixed-scale'.format(self.name) + cur_node = descriptor_final_string + cur_dim = input_dim + + if bottleneck_dim >= 0: + if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: + raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( + bottleneck_dim)) + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + + # note: by default the LinearComponent uses natural gradient. 
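To see why this factorization shrinks the model, compare parameter counts for a direct affine layer against the linear-then-affine pair (sizes hypothetical, biases ignored):

    # 512 -> 3000 directly, versus 512 -> 256 -> 3000 through the bottleneck
    echo $(( 512 * 3000 ))               # 1536000 parameters
    echo $(( 512 * 256 + 256 * 3000 ))   # 899072 parameters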
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint={1} param-stddev={2} ' + 'input-dim={3} output-dim={4} max-change=0.75 {5}' + ''.format(self.name, self.config['orthonormal-constraint'], + self.config['orthonormal-constraint'] / math.sqrt(input_dim), + input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} {6} {7} {8}' + ''.format(self.name, cur_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + configs.append(line) + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.affine'.format(self.name) - if include_log_softmax: - line = ('component name={0}.log-softmax' - ' type=LogSoftmaxComponent dim={1}' - ''.format(self.name, output_dim)) - ans.append((config_name, line)) + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + configs.append(line) - line = ('component-node name={0}.log-softmax' - ' component={0}.log-softmax input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.log-softmax'.format(self.name) + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.log-softmax'.format(self.name) - if output_delay != 0: - cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) - line = ('output-node name={0} input={1} ' - 'objective={2}'.format( - self.name, cur_node, objective_type)) - ans.append((config_name, line)) - return ans + line = ('output-node name={0} input={1} ' + 'objective={2}'.format( + self.name, cur_node, objective_type)) + configs.append(line) + return configs class XconfigBasicLayer(XconfigLayerBase): @@ -637,7 +655,11 @@ class XconfigBasicLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' [Descriptor giving the input of the layer.] - dim=None [Output dimension of layer, e.g. 1024] + dim=-1 [Output dimension of layer, e.g. 1024] + bottleneck-dim=-1 [If you set this, a linear bottleneck is added, so + we project to first bottleneck-dim then to dim. The + first of the two matrices is constrained to be + orthonormal.] self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] learning-rate-factor=1.0 [This can be used to make the affine component train faster or slower]. @@ -657,12 +679,16 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, + 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', + 'ng-linear-options': '', # only affects bottleneck layers. 'dropout-proportion': 0.5, # dropout-proportion only # affects layers with - # 'dropout' in the name. + # 'dropout' in the name + 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout + # mask is shared across time. 
'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). @@ -674,6 +700,10 @@ def set_default_configs(self): def check_configs(self): if self.config['dim'] < 0: raise RuntimeError("dim has invalid value {0}".format(self.config['dim'])) + b = self.config['bottleneck-dim'] + if b >= 0 and (b >= self.config['dim'] or b == 0): + raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: raise RuntimeError("self-repair-scale has invalid value {0}" .format(self.config['self-repair-scale'])) @@ -751,14 +781,41 @@ def _add_components(self, input_desc, input_dim, nonlinearities): "there is a final 'renorm' component.") configs = [] - # First the affine node. + cur_dim = input_dim + cur_node = input_desc + + # First the affine node (or linear then affine, if bottleneck). + if self.config['bottleneck-dim'] > 0: + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + for opt_name in [ 'max-change', 'learning-rate-factor' ]: + value = self.config[opt_name] + if value != '': + linear_options += ' {0}={1}'.format(opt_name, value) + + bottleneck_dim = self.config['bottleneck-dim'] + # note: by default the LinearComponent uses natural gradient. + line = ('component name={0}.linear type=LinearComponent ' + 'input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3}' + ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, input_dim, output_dim, affine_options)) + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={1}' - ''.format(self.name, input_desc)) + ''.format(self.name, cur_node)) configs.append(line) cur_node = '{0}.affine'.format(self.name) @@ -797,8 +854,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'batchnorm': line = ('component name={0}.{1}' - ' type=BatchNormComponent dim={2}' - ' target-rms={3}' + ' type=BatchNormComponent dim={2} target-rms={3}' ''.format(self.name, nonlinearity, output_dim, target_rms)) @@ -808,10 +864,31 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim)) elif nonlinearity == 'dropout': - line = ('component name={0}.{1} type=DropoutComponent ' - 'dim={2} dropout-proportion={3}'.format( - self.name, nonlinearity, output_dim, - self.config['dropout-proportion'])) + if not self.config['dropout-per-dim']: + line = ('component name={0}.{1} type=DropoutComponent ' + 'dim={2} dropout-proportion={3}'.format( + self.name, nonlinearity, output_dim, + self.config['dropout-proportion'])) + else: + line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' + 'output-dim={1} dropout-proportion={2}'.format( + self.name, 
output_dim, self.config['dropout-proportion'])) + configs.append(line) + # note: the input to the dropout_mask component is never used, it's + # just syntactically required. + line = ('component-node name={0}.dropout_mask component={0}.dropout_mask ' + 'input={1}'.format(self.name, cur_node)) + configs.append(line) + line = ('component name={0}.dropout type=ElementwiseProductComponent ' + 'input-dim={1} output-dim={2} '.format( + self.name, 2 * output_dim, output_dim)) + configs.append(line) + line = ('component-node name={0}.dropout component={0}.dropout ' + 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.dropout'.format(self.name) + continue else: raise RuntimeError("Unknown nonlinearity type: {0}" diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9743d0100b9..a7808131a4a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -103,7 +103,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -113,7 +113,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -258,6 +258,8 @@ def generate_lstm_config(self): # This class is for lines like # 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections. It can also generate # outputs without projection, but you could use the XconfigLstmLayer for this # simple LSTM. @@ -292,7 +294,9 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "lstmp-layer" + # lstmp-batchnorm-layer is like lstmp-layer but followed by a batchnorm + # component. 
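Illustrative xconfig lines for the two features added just above (layer names and dimensions are hypothetical; relu-batchnorm-dropout-layer is one of the standard basic-layer name combinations):

    # per-dim dropout, with the mask shared across time via ReplaceIndex(..., t, 0)
    relu-batchnorm-dropout-layer name=tdnn1 dim=512 dropout-per-dim=true dropout-proportion=0.5
    # projected LSTM followed by batchnorm
    lstmp-batchnorm-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3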
+ assert first_token in ["lstmp-layer", "lstmp-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -353,7 +357,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp_t' + node_name = ( 'rp_t_batchnorm' if self.layer_type == 'lstmp-batchnorm-layer' + else 'rp_t' ) if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -375,7 +380,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -385,7 +390,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -542,18 +547,27 @@ def generate_lstm_config(self): configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}" "".format(name, rec_proj_dim, bptrunc_str)) - configs.append("# r_t and p_t : rp_t will be the output") + configs.append("# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)") configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t" "".format(name)) configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + if self.layer_type == "lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm " + "input={0}.rp_t".format(name)) + return configs # This class is for lines like # 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'fast-lstm-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph without output projections. 
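A minimal usage sketch, mirroring the example line above (cell-dim is hypothetical):

    fast-lstm-batchnorm-layer name=lstm1 cell-dim=512 delay=-3

The only structural difference from fast-lstm-layer is the trailing BatchNormComponent, which is why output_name() below returns 'm_batchnorm' rather than 'm' for the batchnorm variant.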
# Unlike 'lstm-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -586,7 +600,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstm-layer" + assert first_token in ["fast-lstm-layer", "fast-lstm-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -626,7 +640,8 @@ def auxiliary_outputs(self): return ['c'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = ('m_batchnorm' if self.layer_type == 'fast-lstm-batchnorm-layer' + else 'm') if auxiliary_output is not None: if auxiliary_output == 'c': node_name = 'c' @@ -647,7 +662,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -657,7 +672,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -697,43 +712,62 @@ def generate_lstm_config(self): # providing output to gate i and operating on an appended vector [x,r] configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, affine_str, l2_regularize_option)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) + configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. 
- configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " + "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " + "IfDefined(Offset({0}.m_trunc, {2})))".format( + name, input_descriptor, delay)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + + if self.layer_type == "fast-lstm-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs # This class is for lines like -# 'fast-lstmb-layer name=lstm1 input=[-1] delay=-3' -# It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix -# of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide -# it into two matrices, with batch-norm in between to stabilize the training. +# 'lstmb-layer name=lstm1 input=[-1] delay=-3' +# +# LSTMB is not something we've published; it's LSTM with a bottleneck in the +# middle of the W_all matrix (where W_all is a matrix that combines the 8 full +# matrices of standard LSTM). W_all is factored into W_all_a and W_all_b, where +# W_all_a is constrained to have orthonormal rows (this keeps it training stably). +# +# It also contains a couple of other improvements: W_all_b is followed by +# trainable ScaleAndOffsetComponent (this is a bit like the idea from the +# publication "Self-stabilized deep neural network" by Ghahramani et al). +# And the LSTM is followed by a batchnorm component (this is by default; it's not +# part of the layer name, like lstmb-batchnorm-layer). + # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. @@ -761,32 +795,30 @@ def generate_lstm_config(self): # i.e. history since about t = t-20, can be # accumulated in c_t.] 
# l2-regularize=0.0 Constant controlling l2 regularization for this layer -class XconfigFastLstmbLayer(XconfigLayerBase): +class XconfigLstmbLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmb-layer" + assert first_token == 'lstmb-layer' XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', + self.config = { 'input':'[-1]', 'cell-dim' : -1, # this is a required argument 'bottleneck-dim': -1, # this is a required argument - 'clipping-threshold' : 30.0, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, + 'clipping-threshold': 30.0, + 'zeroing-interval': 20, + 'zeroing-threshold': 15.0, + 'orthonormal-constraint': 1.0, 'delay' : -1, - # if you want to set 'self-repair-scale' (c.f. the - # self-repair-scale-nonlinearity config value in older LSTM layers), you can - # add 'self-repair-scale=xxx' to - # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', + # the recurrence scale is the scale on m_trunc, used in the + # recurrence (to balance its size with the input). + 'self-scale' : 1.0, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', - 'normalize-type': 'batchnorm', # can be 'batchnorm', 'renorm', or 'none' 'l2-regularize': 0.0, 'decay-time': -1.0 } - self.c_needed = False # keep track of whether the 'c' output is needed. def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -801,34 +833,21 @@ def check_configs(self): self.config['bottleneck-dim'])) if self.config['delay'] == 0: raise RuntimeError("delay cannot be zero") - assert self.config['normalize-type'] in ['batchnorm', 'renorm', 'none'] def auxiliary_outputs(self): - return ['c'] + return [] def output_name(self, auxiliary_output = None): - node_name = 'm' - if auxiliary_output is not None: - if auxiliary_output == 'c': - node_name = 'c' - self.c_needed = True - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) - return '{0}.{1}'.format(self.name, node_name) + assert auxiliary_output is None + return '{0}.m_batchnorm'.format(self.name) def output_dim(self, auxiliary_output = None): - if auxiliary_output is not None: - if auxiliary_output == 'c': - self.c_needed = True - return self.config['cell-dim'] - # add code for other auxiliary_outputs here when we decide to expose them - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) + assert auxiliary_output is None return self.config['cell-dim'] def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -838,7 +857,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -847,6 +866,7 @@ def generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] bottleneck_dim = self.config['bottleneck-dim'] + self_scale = self.config['self-scale'] delay = self.config['delay'] affine_str = self.config['ng-affine-options'] l2_regularize = self.config['l2-regularize'] @@ -872,23 +892,25 @@ def generate_lstm_config(self): configs = [] - # See XconfigFastLstmLayer to 
understand what's going on here. - # This differs from that code by a factorization of the W_all matrix. + # See XconfigFastLstmLayer to understand what's going on here. This + # differs from that code by a factorization of the W_all matrix into two + # pieces with a smaller dimension in between (with the first of the two + # pieces constrained to have orthonormal rows). Note: we don't apply l2 + # regularization to this layer, since, with the orthonormality + # constraint, it's meaningless. configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, bottleneck_dim, - affine_str, l2_regularize_option)) - normalize_type = self.config['normalize-type'] - if normalize_type == 'batchnorm': - configs.append("component name={0}.W_batchnorm type=BatchNormComponent dim={1} ".format( - name, bottleneck_dim)) - elif normalize_type == 'renorm': - configs.append("component name={0}.W_renorm type=NormalizeComponent dim={1} ".format( - name, bottleneck_dim)) - - configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " + "orthonormal-constraint={2} output-dim={3} {4}".format( + name, input_dim + cell_dim, + self.config['orthonormal-constraint'], + bottleneck_dim, affine_str)) + + configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") @@ -897,32 +919,33 @@ def generate_lstm_config(self): "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. 
- configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) - if normalize_type != 'none': - configs.append("component-node name={0}.W_{1} component={0}.W_{1} " - "input={0}.W_all_a".format(name, - normalize_type)) - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_{1}".format(name, normalize_type)) - else: - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_all_a".format(name)) + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + name, input_descriptor, self_scale, delay)) + configs.append("component-node name={0}.W_all_b component={0}.W_all_b " + "input={0}.W_all_a".format(name)) + configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name)) + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all_b, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) - # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead - #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) + "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " + "dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -933,6 +956,8 @@ def generate_lstm_config(self): # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3' # or: # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512' +# (you can also use the name 'fast-lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections (i.e. a projected LSTM, AKA LSTMP). 
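As with the other batchnorm variants above, usage is a drop-in rename; an illustrative line, following the example just quoted:

    fast-lstmp-batchnorm-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512 delay=-3

Here batchnorm is applied to the projected output rp, so the layer's output node becomes rp_batchnorm.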
# Unlike 'lstmp-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -968,7 +993,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmp-layer" + assert first_token in ['fast-lstmp-layer', 'fast-lstmp-batchnorm-layer'] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -1026,7 +1051,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp' + node_name = ('rp_batchnorm' if self.layer_type == 'fast-lstmp-batchnorm-layer' + else 'rp') if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -1048,7 +1074,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -1058,8 +1084,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): - + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness @@ -1104,8 +1129,9 @@ def generate_lstm_config(self): configs.append("## Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + "output-dim={2} {3} {4}".format( + name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") @@ -1123,29 +1149,32 @@ def generate_lstm_config(self): .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") - configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format( + configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent " + "input-dim={1} output-dim={2} {3} {4}".format( name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str, l2_regularize_option)) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) + if dropout_proportion != -1.0: # note: the 'input' is a don't-care as the component never uses it; it's required # in component-node lines. 
configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " "input={0}.dropout_mask".format(name)) configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" - .format(name, delay)) + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})), " + "{0}.dropout_mask)".format(name, delay)) else: configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " "dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("# {0}.rp is the output node of this layer:".format(name)) + configs.append("# {0}.rp is the output node of this layer (if we're not " + "including batchnorm)".format(name)) configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name)) configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) @@ -1158,6 +1187,12 @@ def generate_lstm_config(self): "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + if self.layer_type == "fast-lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_batchnorm component={0}.rp_batchnorm " + "input={0}.rp".format(name)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 7ab70027cef..6fbde1fbbcc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -34,9 +34,12 @@ 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmp-batchnorm-layer' : xlayers.XconfigLstmpLayer, 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, + 'fast-lstm-batchnorm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, - 'fast-lstmb-layer' : xlayers.XconfigFastLstmbLayer, + 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, + 'lstmb-layer' : xlayers.XconfigLstmbLayer, 'stats-layer': xlayers.XconfigStatsLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, 'conv-layer': xlayers.XconfigConvLayer, @@ -65,7 +68,9 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'renorm-component': xlayers.XconfigRenormComponent + 'renorm-component': xlayers.XconfigRenormComponent, + 'no-op-component': xlayers.XconfigNoOpComponent, + 'linear-component': xlayers.XconfigLinearComponent } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 80a2b7df418..63f6278d1ca 100644 --- 
a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -68,3 +68,132 @@ def _generate_config(self):
            self.name, input_desc))
        configs.append(line)
        return configs
+
+
+class XconfigNoOpComponent(XconfigLayerBase):
+    """This class is for parsing lines like
+    'no-op-component name=noop1 input=Append(-3,0,3)'
+    which will produce just a single component, of type NoOpComponent.
+
+    Parameters of the class, and their defaults:
+      input='[-1]'             [Descriptor giving the input of the layer.]
+    """
+    def __init__(self, first_token, key_to_value, prev_names=None):
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]' }
+
+    def check_configs(self):
+        pass
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        return self.name
+
+    def output_dim(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        input_dim = self.descriptors['input']['dim']
+        return input_dim
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+                ans.append((config_name, line))
+        return ans
+
+    def _generate_config(self):
+        # by 'descriptor_final_string' we mean a string that can appear in
+        # config-files, i.e. it contains the 'final' names of nodes.
+        input_desc = self.descriptors['input']['final-string']
+        input_dim = self.descriptors['input']['dim']
+
+        configs = []
+        line = ('component name={0} type=NoOpComponent dim={1}'.format(
+            self.name, input_dim))
+        configs.append(line)
+        line = ('component-node name={0} component={0} input={1}'.format(
+            self.name, input_desc))
+        configs.append(line)
+        return configs
+
+
+class XconfigLinearComponent(XconfigLayerBase):
+    """This class is for parsing lines like
+    'linear-component name=linear1 dim=1024 input=Append(-3,0,3)'
+    which will produce just a single component, of type LinearComponent, with
+    output-dim 1024 in this case, and input-dim determined by the dimension
+    of the input.
+
+    Parameters of the class, and their defaults:
+      input='[-1]'             [Descriptor giving the input of the layer.]
+      dim=-1                   [Dimension of the output]
+
+    The following (shown with their effective defaults) are just passed through
+    to the component's config line.
+
+      orthonormal-constraint=-1
+      max-change=0.75
+      l2-regularize=0.0
+
+    """
+    def __init__(self, first_token, key_to_value, prev_names=None):
+        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
+
+    def set_default_configs(self):
+        self.config = {'input': '[-1]',
+                       'dim': -1,
+                       'orthonormal-constraint': '',
+                       'max-change': 0.75,
+                       'l2-regularize': '' }
+
+    def check_configs(self):
+        if self.config['dim'] <= 0:
+            raise RuntimeError("'dim' must be specified and > 0.")
+
+    def output_name(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        return self.name
+
+    def output_dim(self, auxiliary_output=None):
+        assert auxiliary_output is None
+        assert self.config['dim'] > 0
+        return self.config['dim']
+
+    def get_full_config(self):
+        ans = []
+        config_lines = self._generate_config()
+
+        for line in config_lines:
+            for config_name in ['ref', 'final']:
+                # we do not support user specified matrices in this layer
+                # so 'ref' and 'final' configs are the same.
+ ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + + opts = '' + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize']: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=LinearComponent input-dim={1} output-dim={2} ' + '{3}'.format(self.name, input_dim, output_dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 9ff7f1e2258..08de18167cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -6,6 +6,7 @@ # while xconfig_layers.py contains the code specific to layer types. from __future__ import print_function +from __future__ import division import re import sys @@ -277,6 +278,12 @@ def dim(self, layer_to_dim): return self.items[0].dim(layer_to_dim) elif self.operator == 'Append': return sum([ x.dim(layer_to_dim) for x in self.items]) + elif self.operator == 'Scale': + # e.g. Scale(2.0, lstm1). Return dim of 2nd arg. + return self.items[1].dim(layer_to_dim) + elif self.operator == 'Const': + # e.g. Const(0.5, 512). Return 2nd arg, which is an int. + return self.items[1] else: raise RuntimeError("Unknown operator {0}".format(self.operator)) @@ -312,7 +319,8 @@ def parse_new_descriptor(tokens, pos, prev_names): # when reading this function, be careful to note the indent level, # there is an if-statement within an if-statement. - if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', + 'Switch', 'Failover', 'IfDefined' ]: expect_token('(', tokens[pos], first_token + '()') pos += 1 d.operator = first_token @@ -392,6 +400,38 @@ def parse_new_descriptor(tokens, pos, prev_names): pos += 1 else: raise RuntimeError("code error") + elif first_token in ['Scale', 'Const' ]: + # Parsing something like 'Scale(2.0, lstm1)' or 'Const(1.0, 512)' + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # First arg of Scale() and Const() is a float: the scale or value, + # respectively. + try: + value = float(tokens[pos]) + pos += 1 + d.items = [value] + except: + raise RuntimeError("Parsing {0}, expected float, got {1}".format( + first_token, tokens[pos])) + # Consume the comma. + expect_token(',', tokens[pos], first_token + '()') + pos += 1 + if first_token == 'Scale': + # Second arg of Scale() is a Descriptor. 
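The dim() rules for the two new descriptor operators (added to utils.py above) are worth spelling out: Scale(scale, desc) inherits the dimension of its descriptor argument, while Const(value, dim) carries its dimension as a literal integer. A minimal runnable sketch, with FixedDimDescriptor standing in for a real parsed descriptor:

# Sketch of Descriptor.dim() for the new operators; mirrors the elif branches
# added above. FixedDimDescriptor is a stand-in for a real parsed descriptor.
def descriptor_dim(operator, items, layer_to_dim):
    if operator == 'Scale':
        # items = [2.0, <descriptor>]: dim comes from the 2nd arg.
        return items[1].dim(layer_to_dim)
    elif operator == 'Const':
        # items = [0.5, 512]: the 2nd arg is already the dim.
        return items[1]

class FixedDimDescriptor:
    def __init__(self, dim): self._dim = dim
    def dim(self, layer_to_dim): return self._dim

print(descriptor_dim('Scale', [2.0, FixedDimDescriptor(512)], {}))  # -> 512
print(descriptor_dim('Const', [0.5, 512], {}))                      # -> 512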
+ (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + else: + assert first_token == 'Const' + try: + dim = int(tokens[pos]) + pos += 1 + d.items.append(dim) + except: + raise RuntimeError("Parsing Const() expression, expected int, got {0}".format( + tokens[pos])) + expect_token(')', tokens[pos], first_token) + pos += 1 elif first_token in [ 'end of string', '(', ')', ',', '@' ]: raise RuntimeError("Expected descriptor, got " + first_token) elif is_valid_line_name(first_token) or first_token == '[': @@ -555,7 +595,7 @@ def parse_config_line(orig_config_line): rest_of_line = ' '.join(fields) # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' - positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + positions = list(map(lambda x: x.start(), re.finditer('"', rest_of_line))) if not len(positions) % 2 == 0: raise RuntimeError("Double-quotes should occur in pairs") @@ -565,7 +605,7 @@ def parse_config_line(orig_config_line): # and replace the quotation marks themselves with spaces. # Then later on we'll convert all the question marks to # equals signs in the values in the dicts. - num_strings = len(positions) / 2 + num_strings = len(positions) // 2 fields = [] for i in range(num_strings): start = positions[i * 2] @@ -588,7 +628,7 @@ def parse_config_line(orig_config_line): if not (other_fields[0] == '' and len(other_fields) % 2 == 1): raise RuntimeError("Could not parse config line."); fields += other_fields[1:] - num_variables = len(fields) / 2 + num_variables = len(fields) // 2 for i in range(num_variables): var_name = fields[i * 2] var_value = fields[i * 2 + 1] @@ -634,6 +674,8 @@ def test_library(): ('Append(-3,0,3)', 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), ('[-1]', 'prev_layer'), + ('Scale(2.0,foo)', 'Scale(2.0, foo)'), + ('Const(0.5,500)', 'Const(0.5, 500)'), ('[-2]', 'last_but_one_layer'), ('[-2]@3', 'Offset(last_but_one_layer, 3)') ]: diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index 817f7d1f10b..aaf88cc66d2 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -8,6 +8,7 @@ # is to be used for segmentation, and uses that together with a model to # make a decoding graph. # Uses SRILM. +# See also utils/lang/make_phone_bigram_lm.sh. # Begin configuration section. stage=0 diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index cec6f8e166f..0294df0d84a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -150,6 +150,8 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. rm $dir/uniq2utt $dir/valid_uttlist.tmp fi +echo "$0: creating egs. 
To ensure they are not deleted later you can do: touch $dir/.nodelete" + cat $data/utt2dur | \ awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6896da67f73..144d29641fd 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -216,15 +216,12 @@ def process_args(args): "--trainer.deriv-truncate-margin.".format( args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) - or (not os.path.exists(args.dir+"/configs") and - (args.input_model is None or not os.path.exists(args.input_model)))): - raise Exception("This script expects {0} to exist. Also either " - "--trainer.input-model option as initial 'raw' model " - "(used as 0.raw in the script) should be supplied or " - "{0}/configs directory which is the output of " - "make_configs.py script should be provided." - "".format(args.dir)) + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist.") if args.transform_dir is None: args.transform_dir = args.lat_dir @@ -274,6 +271,10 @@ def train(args, run_opts): chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, args.lat_dir) + # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) @@ -505,6 +506,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, apply_deriv_weights=args.apply_deriv_weights, @@ -522,7 +524,7 @@ def train(args, run_opts): backstitch_training_interval=args.backstitch_training_interval) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain + # do a clean up everything but the last 2 models, under certain # conditions common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, @@ -573,8 +575,9 @@ def train(args, run_opts): # delete it remove_egs = False + # leave the last-two-numbered models, for diagnostic reasons. common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + args.dir, num_iters - 1, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) @@ -588,7 +591,7 @@ def train(args, run_opts): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.execute_command("steps/info/nnet3_dir_info.pl " + common_lib.execute_command("steps/info/chain_dir_info.pl " "{0}".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 4ba8cae2d56..f5340fb4611 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -158,6 +158,9 @@ for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will +# use it to check compatibility between training and decoding phone-sets. +cp $treedir/phones.txt $dir # Set some variables. nj=`cat $treedir/num_jobs` || exit 1; # number of jobs in alignment dir... diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 50e02629db0..8c520e0b5e1 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -70,6 +70,8 @@ if [ ! -z "$online_ivector_dir" ]; then extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index a6dd9682616..c8cbf67c8b8 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -138,6 +138,8 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir +echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" + # because we'll need the features with a different number of jobs than $alidir, # copy to ark,scp. if [ -f $transform_dir/raw_trans.1 ]; then diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 073ad3e7d7a..e21fdb9f43e 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -162,6 +162,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir) num_jobs = common_lib.get_number_of_jobs(args.ali_dir) @@ -328,6 +332,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, @@ -365,16 +370,16 @@ def train(args, run_opts): egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations) - + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - + # If args.do_final_combination is true, we will use the combined model. # Otherwise, we will use the last_numbered model. 
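The check_phones_compatible.sh call added to decode.sh above compares the training and graph phone tables while ignoring disambiguation symbols (it does cmp on the two tables after grep -v "^#"). A sketch of the same comparison in Python:

# Sketch of the compatibility test: two phone-symbol tables are compatible if
# they match once lines beginning with '#' (disambiguation symbols) are dropped.
def phones_compatible(table1_path, table2_path):
    def load(path):
        with open(path) as f:
            return [line for line in f if not line.startswith('#')]
    return load(table1_path) == load(table2_path)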
real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter=real_iter, + dir=args.dir, iter=real_iter, egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 2d092ceebc7..d5b37871d70 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -356,6 +356,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index b51632e7d2c..686b76aa7db 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -432,6 +432,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 005e751cae0..1d2135c90c2 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -218,6 +218,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will + # use it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.ali_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) @@ -410,6 +414,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index fbcf426b205..f023d38b26c 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -148,6 +148,9 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will +# use it to check compatibility between training and decoding phone-sets. +cp $alidir/phones.txt $dir # Set some variables. num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 diff --git a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh index 50191cf90cb..59ae4a4c994 100755 --- a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh +++ b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh @@ -6,6 +6,11 @@ # 2015 Hainan Xu +# The thing that this script implements is described in the paper: +# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR" +# by Guoguo Chen et al, see +# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf + . 
./path.sh || exit 1; # begin configuration @@ -73,7 +78,7 @@ fi # the cat and awk commands below are implementing add-one smoothing. cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; } - END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; + END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; print num / den, p } } ' | \ awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt @@ -108,6 +113,11 @@ fi # Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file # exists. The format of $dir/lexiconp_silprob.txt is: # word pron-prob P(s_r | w) F(s_l | w) F(n_l | w) pron +# where: P(s_r | w) is the probability of silence to the right of the word +# F(s_l | w) is a factor which is greater than one if silence to the +# left of the word is more than averagely probable. +# F(n_l | w) is a factor which is greater than one if nonsilence to the +# left of the word is more than averagely probable. if [ -n "$sil_counts" ]; then if [ ! -s "$sil_counts" ]; then echo "$0: expected file $sil_counts to exist and not empty" && exit 1; @@ -175,7 +185,7 @@ if [ -n "$sil_counts" ]; then # Computes F(s_l | w) and F(n_l | w) in the paper. $lambda3 = 2; # Smoothing term, \lambda_3 in the paper. foreach my $wpron (keys %all_wprons) { - @col = split(" ", $wpron); + @col = split(" ", $wpron); $word = shift @col; $pron = join(" ", @col); $pron_prob = $all_wprons{$wpron}; @@ -189,7 +199,7 @@ if [ -n "$sil_counts" ]; then print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n"; } - + # Create silprob.txt $BOS_sil_count = $wpron_sil{""} + $sil_prob * $lambda2; $BOS_nonsil_count = $wpron_nonsil{""} + (1 - $sil_prob) * $lambda2; @@ -206,7 +216,7 @@ if [ -n "$sil_counts" ]; then fi # now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are -# in the same order. +# in the same order. cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt diff --git a/egs/wsj/s5/utils/lang/check_phones_compatible.sh b/egs/wsj/s5/utils/lang/check_phones_compatible.sh index 18301a900c5..cfad06d2b8c 100755 --- a/egs/wsj/s5/utils/lang/check_phones_compatible.sh +++ b/egs/wsj/s5/utils/lang/check_phones_compatible.sh @@ -18,11 +18,8 @@ # except for possible differences in disambiguation symbols (meaning that all # symbols except those beginning with a # are mapped to the same values). # Otherwise it prints a warning and exits with status 1. -# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 -# if one of the phone symbol tables does not exist. -# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 +# For the sake of compatibility with other scripts that did not write the +# phones.txt to model directories, this script exits silently with status 0 # if one of the phone symbol tables does not exist. . utils/parse_options.sh || exit 1; @@ -36,24 +33,24 @@ fi table_first=$1 table_second=$2 -# check the files exist or not +# check if the files exist or not if [ ! -f $table_first ]; then if [ ! -f $table_second ]; then echo "$0: Error! Both of the two phones-symbol tables are absent." 
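To make the lexiconp_silprob.txt quantities documented above concrete, here is a small numeric sketch of the smoothed P(s_r | w) estimate, following the count-plus-prior pattern visible in the Perl code (the counts and the lambda value are made up; the exact treatment of the F(s_l | w) and F(n_l | w) factors follows the Chen et al. paper and is not reproduced here):

# Hedged sketch of the smoothed right-silence probability for one (word, pron):
# smoothed counts are raw counts plus lambda2 mass split by the corpus-wide
# silence prior, as in the BOS computation shown above.
sil_count, nonsil_count = 40.0, 60.0   # hypothetical right-context counts
sil_prob = 0.3                         # corpus-wide P(silence), hypothetical
lambda2 = 2.0                          # smoothing mass, \lambda_2 in the paper
P_sr_w = (sil_count + sil_prob * lambda2) / (sil_count + nonsil_count + lambda2)
print(P_sr_w)   # ~0.398: smoothed probability of silence after the word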
echo "Please check your command" exit 1; else - #The phones-symbol-table1 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table1 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi elif [ ! -f $table_second ]; then - #The phones-symbol-table2 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table2 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi -#Check the two tables are same or not (except for possible difference in disambiguation symbols). +# Check if the two tables are the same (except for possible difference in disambiguation symbols). if ! cmp -s <(grep -v "^#" $table_first) <(grep -v "^#" $table_second); then echo "$0: phone symbol tables $table_first and $table_second are not compatible." exit 1; diff --git a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh index dcb77bb1342..1d3d04896b4 100755 --- a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh @@ -9,10 +9,10 @@ # is to limit the number of transitions, so we can decode reasonably fast, and the # graph won't blow up. This is probably going to be most useful for things like # language-id. +# +# See also steps/make_phone_graph.sh -# We might later have options here; if not, I'll emove this. - echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. diff --git a/src/.version b/src/.version index d346e2ab7f2..37c2d9960ec 100644 --- a/src/.version +++ b/src/.version @@ -1 +1 @@ -5.3 +5.4 diff --git a/src/INSTALL b/src/INSTALL index f40a514c4b6..d794cab67ee 100644 --- a/src/INSTALL +++ b/src/INSTALL @@ -9,21 +9,12 @@ You must first have completed the installation steps in ../tools/INSTALL The installation instructions are ./configure --shared - make depend - make - -Note that "make" takes a long time. You can speed it up by running make -in parallel if you have multiple CPUs, e.g. to use 8 CPUs - make depend -j 8 make -j 8 -Kaldi requires a relatively recent C++ compiler with C++11 support, -e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. If your system -default compiler does not support C++11, you can specify a C++11 compliant -compiler by setting the CXX environment variable, e.g. - - CXX=g++-4.8 ./configure --shared +Note that we added the "-j 8" to run in parallel because "make" takes a long +time. 8 jobs might be too many for a laptop or small desktop machine with not +many cores. For more information, see documentation at http://kaldi-asr.org/doc/ and click on "The build process (how Kaldi is compiled)". diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 2b27d4b9176..620ea873eb7 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -260,7 +260,7 @@ bool DenominatorComputation::Backward( BetaGeneralFrameDebug(t); Beta(t); if (t % kMaxDerivTimeSteps == 0) { - // commit the derivative stored in exp_nnet_output_transposed_ by adding + // commit the derivative stored in nnet_output_deriv_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. 
int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), frames_per_sequence_ - t), diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index a4a417c8a5d..f44588e434f 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -51,7 +51,7 @@ namespace chain { All this is done in parallel over multiple sequences, but the computations are independent over the separate sequences, so we won't introduce any notation - or index for the sequence; we'll just explain it for one sequences. + or index for the sequence; we'll just explain it for one sequence. Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and @@ -313,4 +313,3 @@ class DenominatorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ - diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7bf3c17854a..d14c80cd84f 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -607,8 +607,8 @@ void TestRanges() { int main() { using namespace kaldi; SetVerboseLevel(1); - int32 loop = 0; #if HAVE_CUDA == 1 + int32 loop = 0; for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 53de69a0e07..bf61bed67f0 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -33,38 +33,45 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv) { - BaseFloat num_logprob_weighted; - if (nnet_output_deriv) + CuMatrix *xent_output_deriv) { + BaseFloat num_logprob_weighted, den_logprob_weighted; + bool ok = true; + if (nnet_output_deriv != NULL) nnet_output_deriv->SetZero(); + + { // Doing the denominator first helps to reduce the maximum + // memory use, as we can set 'xent_deriv' to nonempty after + // we've freed the memory in this object. + DenominatorComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output); + + den_logprob_weighted = supervision.weight * denominator.Forward(); + if (nnet_output_deriv) + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); + } + + if (xent_output_deriv != NULL) + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + + { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from - // the numerator object, and the logprob too. + // the numerator object, as well as the returned logprob. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); - if (xent_output_deriv) - xent_output_deriv->CopyFromMat(*nnet_output_deriv); - } else if (xent_output_deriv) { - // this branch will be taken if xent_output_deriv but not - // nnet_output_deriv is set- which could happen if you want to compute the - // cross-entropy objective but not the derivatives. 
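The control-flow change in this hunk is easier to see in miniature: the denominator forward-backward now runs first (so its temporaries are freed before xent_output_deriv is allocated, lowering peak memory), and the numerator derivative is written once into xent_output_deriv and then added into nnet_output_deriv instead of being computed into both. A runnable toy with stand-in math (softmax/argmax replace the real forward-backward computations):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def toy_chain_objf(out, weight=1.0, want_deriv=True, want_xent=True):
    deriv = np.zeros_like(out) if want_deriv else None
    # Denominator part first: its temporaries die before xent_deriv exists.
    den_logprob = weight * float(np.logaddexp.reduce(out.ravel()))
    if deriv is not None:
        deriv -= weight * softmax(out)            # stand-in for denominator.Backward
    xent_deriv = np.zeros_like(out) if want_xent else None
    num_logprob = weight * float(out.max())       # stand-in for numerator.Forward
    num_post = np.zeros_like(out)
    num_post.flat[out.argmax()] = weight          # stand-in numerator "posterior"
    if xent_deriv is not None:
        xent_deriv += num_post                    # numerator writes into xent_deriv...
        if deriv is not None:
            deriv += xent_deriv                   # ...and is added in, not recomputed
    elif deriv is not None:
        deriv += num_post
    return num_logprob - den_logprob, deriv, xent_deriv

print(toy_chain_objf(np.random.randn(4, 5))[0])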
- xent_output_deriv->SetZero(); + + if (xent_output_deriv) { numerator.Backward(xent_output_deriv); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + } else if (nnet_output_deriv) { + numerator.Backward(nnet_output_deriv); } } - DenominatorComputation denominator(opts, den_graph, - supervision.num_sequences, - nnet_output); - - BaseFloat den_logprob = denominator.Forward(); - bool ok = true; - if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, - nnet_output_deriv); - *objf = num_logprob_weighted - supervision.weight * den_logprob; + *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { @@ -86,7 +93,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // for different frames of the sequences. As expected, they are // smaller towards the edges of the sequences (due to the penalization // of 'incorrect' pdf-ids. - if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL && RandInt(0, 10) == 0) { int32 tot_frames = nnet_output_deriv->NumRows(), frames_per_sequence = supervision.frames_per_sequence, num_sequences = supervision.num_sequences; diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..d6535902625 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -107,10 +107,13 @@ struct ChainTrainingOptions { You don't have to zero this before passing to this function, we zero it internally. @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative - (which equals a posterior from the numerator forward-backward, - scaled by the supervision weight) is written to here. This will - be used in the cross-entropy regularization code. This value - is also used in computing the cross-entropy objective value. + (which equals a posterior from the numerator + forward-backward, scaled by the supervision weight) + is written to here (this function will set it to the + correct size first; doing it this way reduces the + peak memory use). xent_output_deriv will be used in + the cross-entropy regularization code; it is also + used in computing the cross-entropy objective value. 
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -120,12 +123,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv = NULL); - + CuMatrix *xent_output_deriv = NULL); + } // namespace chain } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_TRAINING_H_ - diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index e6ade23728f..ca831390ea9 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -7,11 +7,11 @@ LDLIBS += $(CUDA_LDLIBS) TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \ cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test \ - cu-sparse-matrix-test cu-device-test cu-rand-speed-test + cu-sparse-matrix-test cu-device-test cu-rand-speed-test cu-compressed-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ - cu-sparse-matrix.o cu-allocator.o cu-array.o + cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o endif @@ -33,4 +33,3 @@ endif $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ include ../makefiles/default_rules.mk - diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f2ccf0d6c29..0f96315e848 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -54,7 +54,7 @@ struct CuAllocatorOptions { // is a constant overhead proportional to the number of buckets. BaseFloat delete_factor; - CuAllocatorOptions(): memory_factor(1.5), + CuAllocatorOptions(): memory_factor(1.3), delete_factor(0.001) { } void Check() { diff --git a/src/cudamatrix/cu-compressed-matrix-test.cc b/src/cudamatrix/cu-compressed-matrix-test.cc new file mode 100644 index 00000000000..3cbd7bd5060 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix-test.cc @@ -0,0 +1,179 @@ +// cudamatrix/cu-compressed-matrix-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudamatrix/cu-matrix-lib.h" + +using namespace kaldi; + + +namespace kaldi { + +void CuCompressedMatrixTestSign() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandn(); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(kCompressedMatrixUint8, 0.0); + + // this just stores (M(i, j) > 0 ? 1 : 0). 
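The round trip this test checks can be modeled in a few lines of numpy: with range == 0, compression keeps only the sign information, so compress-then-uncompress should equal Heaviside(M):

# Numpy model of the sign-compression round trip (toy sizes):
import numpy as np
M = np.random.randn(5, 4).astype(np.float32)
compressed = (M > 0).astype(np.uint8)        # CopyFromMat with range == 0
M2 = compressed.astype(np.float32)           # CopyToMat reconstructs 0/1 values
assert np.array_equal(M2, (M > 0).astype(np.float32))   # i.e. M.Heaviside(M)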
+ cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + M.Heaviside(M); + + AssertEqual(M, M2); + delete cm; +} + +void CuCompressedMatrixTestNonnegative() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? + kCompressedMatrixUint8 : + kCompressedMatrixUint16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixUint8 ? 255 : 65535); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + +// this is like CuCompressedMatrixTestNonnegative but +// with signed integers, and input in the range [-range, +range]. +void CuCompressedMatrixTestSymmetric() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + M.Scale(2.0); + M.Add(-1.0); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? + kCompressedMatrixInt8 : + kCompressedMatrixInt16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixInt8 ? 127 : 32767); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + + + +} // namespace kaldi + + +int main() { + SetVerboseLevel(1); + // we don't run this test if CUDA is not compiled in, since + // you can't instantiate class CuCompressedMatrix in that case. 
+#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId("yes"); + for (int32 i = 1; i < 10; i++) { + CuCompressedMatrixTestSign(); + CuCompressedMatrixTestNonnegative(); + CuCompressedMatrixTestSymmetric(); + } + +#endif + return 0; +} diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc new file mode 100644 index 00000000000..be02921169d --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -0,0 +1,142 @@ +// cudamatrix/cu-compressed-matrix.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#if HAVE_CUDA == 1 +#include +#include +#endif + +#include "base/timer.h" +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-vector.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-kernels.h" +#include "cudamatrix/cu-array.h" +#include "cudamatrix/cu-compressed-matrix.h" + +namespace kaldi { + + +template +CuCompressedMatrix::CuCompressedMatrix(BaseFloat range, bool truncate): + data_(NULL), scale_(range / std::numeric_limits::max()), + truncate_(truncate), num_rows_(0), num_cols_(0), stride_(0) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); +#else + KALDI_ERR << "You instantiated CuCompressedMatrix while GPU use " + "was not compiled in."; +#endif +} + +template +void CuCompressedMatrix::Destroy() { +#if HAVE_CUDA == 1 + if (data_ != NULL) { + // we don't bother timing this because Free() won't normally have to + // access the GPU at all (due to caching). + CuDevice::Instantiate().Free(data_); + data_ = NULL; + num_rows_ = 0; + num_cols_ = 0; + stride_ = 0; + } +#endif +} + +template +void CuCompressedMatrix::CopyFromMat( + const CuMatrixBase &mat) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + if (mat.NumRows() == 0) + return; + if (num_rows_ != mat.NumRows() || num_cols_ != mat.NumCols()) { + Destroy(); + num_rows_ = mat.NumRows(); + num_cols_ = mat.NumCols(); + data_ = static_cast( + CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_)); + stride_ = num_cols_; + } + + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. 
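The constructor above fixes scale_ = range / numeric_limits::max, and the truncate flag decides what happens to out-of-range inputs at compression time. A numpy sketch of the two modes for int8 (values chosen just outside the representable range to show the difference):

import numpy as np
range_ = 1.0
scale = range_ / 127.0                           # scale_ for int8
f = np.array([1.02, -1.02], dtype=np.float32)    # slightly outside [-range, range]
i_exact = np.rint(f / scale).astype(np.int64)    # 130, -130: outside int8
clamped = np.clip(i_exact, -128, 127).astype(np.int8)   # truncate == true
wrapped = i_exact.astype(np.int8)                        # truncate == false: wraps
print(clamped, wrapped)   # [ 127 -128] vs [-126  126]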
+ cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_); + } else { + cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_, float(1.0 / scale_), + truncate_); + } + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim); + } +#endif +} + +template +void CuCompressedMatrix::CopyToMat(CuMatrixBase *mat) const { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + KALDI_ASSERT(mat->NumRows() == num_rows_ && mat->NumCols() == num_cols_); + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + BaseFloat scale = (scale_ == 0.0 ? 1.0 : scale_); + cuda_mat_uncompress(dimGrid, dimBlock, mat->Data(), mat->Dim(), + data_, stride_, float(scale)); + } +#endif +} + + +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range, + bool truncat) { + if (t == kCompressedMatrixUint8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixUint16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else { + KALDI_ERR << "Unknown compressed-matrix type"; + return NULL; + } +} + + + +} // namespace kaldi diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h new file mode 100644 index 00000000000..1ef7853b906 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -0,0 +1,162 @@ +// cudamatrix/cu-compressed-matrix.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#ifndef KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ +#define KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ + +#include "cudamatrix/cu-matrix.h" + +namespace kaldi { + +/** + Class CuCompressedMatrixBase is an abstract base class that allows you to + compress a matrix of type CuMatrix. When you instantiate it you + would choose the child-class type (by allocating the appropriate child-class + type via 'new'). + */ +class CuCompressedMatrixBase { + public: + + /// Sets *this to an appropriately compressed copy of 'mat', which + /// includes resizing *this. The details of how this is done will be + /// different in different child classes. + virtual void CopyFromMat(const CuMatrixBase &mat) = 0; + + /// Copies the contents of *this to 'mat', which should be + /// correctly sized beforehand. + virtual void CopyToMat(CuMatrixBase *mat) const = 0; + + + // The number of rows in *this. + virtual int32 NumRows() const = 0; + + // The number of columns in *this. 
+ virtual int32 NumCols() const = 0; + + virtual ~CuCompressedMatrixBase() { } +}; + + + +/** + Class CuCompressedMatrix, templated on an integer type (expected to be one + of: int8, uint8, int16, uint16), this provides a way to approximate a + CuMatrix in a more memory-efficient format. It's used in nnet3 to + reduce memory use for large networks. + + It is *not* a CUDA equivalent for class CompressedMatrix (of + ../matrix/compressed-matrix.h). Note: this class is only to be used when you + are using a GPU. If you didn't compile for CUDA or you are not using a GPU, + you are not supposed to create an instance of this class, and doing so will + cause a runtime error. + */ +template +class CuCompressedMatrix: public CuCompressedMatrixBase { + public: + + /// Constructor which sets 'scale_' according to + /// scale_ = range / std::numeric_limits::max(). + /// + /// range = 0 (only supported for I == int8) is a special case in which only + /// the sign of the input is retained; and when we reconstruct, the output + /// will be -1, 0 or 1. + /// + /// truncate (only relevant if range != 0) should be true if it's possible + /// that the input could exceed the allowed input range, i.e. [0, range] if I + /// is unsigned, and [-range, range] if I is signed; and it may be false if + /// you know that the input (the matrix given to CopyFromMat) will have + /// elements only in the allowed range. Setting 'truncate' to false + /// allows the compression code to avoid the bounds check. + CuCompressedMatrix(BaseFloat range, bool truncate = true); + + virtual void CopyFromMat(const CuMatrixBase &mat); + + virtual void CopyToMat(CuMatrixBase *mat) const; + + virtual MatrixIndexT NumRows() const { return num_rows_; } + + virtual MatrixIndexT NumCols() const { return num_cols_; } + + + virtual ~CuCompressedMatrix() { Destroy(); } + + private: + // If there was data in 'data_', frees it, and sets it to NULL. + void Destroy(); + + // The raw data. + I *data_; + + // scale_ affects how the raw data is interpreted as a floating point value. + // When uncompressing to a CuMatrix, we'll do: + // f = scale_ * i + // where f is the floating point value we're writing to, and i is the integer + // value. + // + // scale_ = 0 is treated specially; in this case we just take notice of the + // sign of the input, and when uncompressing we do it with a scale such + // that the output becomes -1, 0 and 1. + BaseFloat scale_; + + // 'truncate_' affects the code that compresses data to integer values. + // If the data we're compressing might possibly be outside of the representable + // range, then you should set truncate to true (this is the default in the + // constructor). This way, values larger than the minimum or maximum will + // be set to the minimum or maximum value. If truncate_ is false, it will + // just wrap around, but the compression code will be slightly faster as + // it doesn't need to check. + bool truncate_; + + MatrixIndexT num_rows_; + MatrixIndexT num_cols_; + // stride_ is currently always equal to num_cols_; it was added mainly to + // point the way to possible future extension. + MatrixIndexT stride_; +}; + + + +// This enum value is used to encode the type you want to instantiate +// a CuCompressedMatrix with. It's used in class NnetComputation +// (cast to int32) as one of the arguments of kCompressMatrix. 
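A CPU toy mirroring the usage contract documented above may help: CopyFromMat resizes the compressed object itself, while CopyToMat expects a destination the caller has already sized (this is only an analogy; the real class is GPU-only):

import numpy as np

class ToyCompressedMatrix:
    def __init__(self, range_):
        self.scale = range_ / 127.0     # int8 analogue of the scale_ member
        self.data = None
    def copy_from_mat(self, mat):       # resizes self to mat's dimensions
        self.data = np.clip(np.rint(mat / self.scale), -128, 127).astype(np.int8)
    def copy_to_mat(self, mat):         # dest must be correctly sized beforehand
        assert mat.shape == self.data.shape
        mat[...] = self.data.astype(np.float32) * self.scale

cm = ToyCompressedMatrix(1.0)
M = np.random.uniform(-1, 1, (3, 4)).astype(np.float32)
cm.copy_from_mat(M)
M2 = np.empty_like(M)
cm.copy_to_mat(M2)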
+enum CuCompressedMatrixType { + kCompressedMatrixInt8 = 1, + kCompressedMatrixUint8 = 2, + kCompressedMatrixInt16 = 3, + kCompressedMatrixUint16 = 4 +}; + +/** + This function allocates a new CuCompressedMatrix with type determined + by t, and with the 'range' and 'truncate' parameters provided to the + constructor of class CuCompressedMatrix. + + It will crash at runtime if called when CUDA is not compiled in, or not + enabled. + */ +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range, + bool truncate = true); + + +} // namespace kaldi + +#endif diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 9b0976b05ad..87e266e1889 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -58,6 +58,15 @@ namespace kaldi { */ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { + + // Our first attempt to get a device context is: we do cudaFree(0) and see if + // that returns no error code. If it succeeds then we have a device + // context. Apparently this is the canonical way to get a context. + if (cudaFree(0) == 0) + return true; + + // The rest of this code represents how we used to get a device context, but + // now its purpose is mainly a debugging one. std::ostringstream debug_stream; debug_stream << "num-gpus=" << num_gpus << ". "; for (int32 device = 0; device < num_gpus; device++) { @@ -220,9 +229,9 @@ void CuDevice::FinalizeActiveGpu() { } // Remember the id of active GPU active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on - // Initialize the CUBLAS + // Initialize CUBLAS. CUBLAS_SAFE_CALL(cublasCreate(&handle_)); - // Initialize the cuSPARSE + // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); // Notify user which GPU is finally used diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 7d2db9adcc9..8ab03c7e14e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -30,6 +30,15 @@ #if HAVE_CUDA == 1 extern "C" { +// "C" version of the BaseFloat typedef-- this saves us having to write +// multiple versions of these kernels. 
+#if (KALDI_DOUBLEPRECISION != 0) +typedef double BaseFloat; +#else +typedef float BaseFloat; +#endif + + void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta); @@ -736,6 +745,42 @@ void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); + +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); + +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, int dest_stride); + +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale); +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale); +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale); +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale); + + + } // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 2f8f37224be..ae7e25b716d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3558,6 +3558,104 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m } } + +__global__ +static void _cuda_compress_uint8_sign(const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + BaseFloat f = src[src_index]; + dest[dest_index] = (f > 0.0 ? (unsigned char)1 : (unsigned char)0); + } +} + + +// The following inline templated functions are a workaround for the +// fact that (I believe) std::numeric_limits is not available in CUDA; +// they allow us to access the minimum and maximum elements of certain +// types from templated code. 
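As a cross-check on the limits hard-coded in the specializations declared just below, the same values fall out of the standard integer-type limits:

import numpy as np
for t in [np.int8, np.uint8, np.int16, np.uint16]:
    info = np.iinfo(t)
    print(t.__name__, info.min, info.max)
# int8 -128 127, uint8 0 255, int16 -32768 32767, uint16 0 65535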
+template __device__ static inline int minimum_integer_value(); +template __device__ static inline int maximum_integer_value(); + +template<> __device__ int maximum_integer_value() { return 127; } +template<> __device__ int minimum_integer_value() { return -128; } +template<> __device__ int maximum_integer_value() { return 255; } +template<> __device__ int minimum_integer_value() { return 0; } +template<> __device__ int maximum_integer_value() { return 32767; } +template<> __device__ int minimum_integer_value() { return -32768; } +template<> __device__ int maximum_integer_value() { return 65535; } +template<> __device__ int minimum_integer_value() { return 0; } + + + +template +__global__ +static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, float inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + const int min_value = minimum_integer_value(), + max_value = maximum_integer_value(); + int compressed_value; + int ok = (i < dim.cols && j < dim.rows); + if (ok) { + float f = src[src_index]; + // note: I'm not sure what __float2int_rn does if input is outside of + // integer range, but it doesn't matter much as in the situations where this + // type of compression would make sense, the input should be well inside the + // range of 'int', and if it fails, we've probably already catastrophically + // diverged. + int i = __float2int_rn(f * inv_scale); + if (i < min_value) compressed_value = min_value; + else if (i > max_value) compressed_value = max_value; + else compressed_value = i; + } + __syncthreads(); + if (ok) { + dest[dest_index] = compressed_value; + } +} + + +template +__global__ +static void _cuda_compress_no_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, + float inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + float f = src[src_index]; + int i = __float2int_rn(f * inv_scale); + I s = i; + dest[dest_index] = s; + } +} + +template +__global__ +static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, + const I *src, int src_stride, + float scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int src_index = i + j * src_stride, + dest_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + I s = src[src_index]; + dest[dest_index] = float(s * scale); + } +} + + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels */ @@ -5220,3 +5318,69 @@ void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, _apply_exp_special<<>>(out, out_dim, in, in_stride); } +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + _cuda_compress_uint8_sign<<>>(src, dim, dest, dest_stride); +} + +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + 
MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} + +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 27ccf760557..3518e0c71ed 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1463,6 +1463,73 @@ inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, cudaF_vec_sum(Gr, Bl, v, value, dim, inc); } +// Compresses the matrix in 'src' to 'dest', retaining only zero-one +// information (1 if the value is >0, 0 otherwise) +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8 *dest, + int dest_stride) { + cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride); +} +// this template handles the other types that are not instantiated yet, +// to avoid compilation errors. 
+template +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, I *dest, + int dest_stride) { + KALDI_ERR << "Not implemented for this type."; +} + +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} + +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale) { + cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale); +} + + } // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-matrix-lib.h b/src/cudamatrix/cu-matrix-lib.h index ef21a2945f1..1da7efafc97 100644 --- a/src/cudamatrix/cu-matrix-lib.h +++ b/src/cudamatrix/cu-matrix-lib.h @@ -29,5 +29,6 @@ #include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cu-rand.h" +#include "cudamatrix/cu-compressed-matrix.h" #endif diff --git a/src/doc/get_version_info.sh b/src/doc/get_version_info.sh index 422798905f5..c11fb7f805e 100755 --- a/src/doc/get_version_info.sh +++ b/src/doc/get_version_info.sh @@ -1,6 +1,6 @@ #!/bin/bash -# search for VERSIONS below to see how to change this when +# search for VERSION below to see how to change this when # Kaldi's version number increases. # Note: this script assumes that it's part of a git repository where @@ -42,7 +42,8 @@ fi # Note: when you add new tuples here you'll also want to add ndew # \htmlinclude directives in versions.dox. -for tuple in "5.0 5.0 c160a9883" "5.1 5.1 2145519961" "5.2 5.2 393ef73caa93" "5.3 master 131cdd4cb544"; do +for tuple in "5.0 5.0 c160a9883" "5.1 5.1 2145519961" "5.2 5.2 393ef73caa93" "5.3 5.3 131cdd4cb544" \ + "5.4 master be969d7baf04"; do major_minor_number=$(echo $tuple | awk '{print $1}') # e.g. 5.0 branch=$(echo $tuple | awk '{print $2}') # e.g. 
   first_commit=$(echo $tuple | awk '{print $3}')
diff --git a/src/doc/lattices.dox b/src/doc/lattices.dox
index 714d9de6f2e..0b222ec5f1a 100644
--- a/src/doc/lattices.dox
+++ b/src/doc/lattices.dox
@@ -264,8 +264,10 @@ has the same effect as calling that the normal OpenFst RemoveEps() and Determini
 
 \section lattices_generation Lattice generation
 
-Currently, the only decoder that generates lattices is the class
-LatticeSimpleDecoder, defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc.
+Command-line decoding programs that have 'latgen' in their names generate lattices.
+Currently most of these use LatticeFasterDecoder. For purposes of exposition we will
+focus instead on LatticeSimpleDecoder, whose operation is simpler.
+This is defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc.
 As the name suggests, LatticeSimpleDecoder is a lattice-generating decoder that is
 modified from SimpleDecoder. SimpleDecoder is a straightforwardly implemented Viterbi
 beam search algorithm with only a single tunable parameter: the pruning beam (see \ref decoders_simple). LatticeSimpleDecoder has
diff --git a/src/doc/versions.dox b/src/doc/versions.dox
index 9461ef1e873..d12b8621ccd 100644
--- a/src/doc/versions.dox
+++ b/src/doc/versions.dox
@@ -116,7 +116,7 @@
 
   \subsection versions_versions_53 Version 5.3
 
-  Version 5.3 is the current master branch. Major changes that were made between the end of 5.2.x
+  Major changes that were made between the end of 5.2.x
   and the start of the 5.3 branch include:
     - Create a nnet3-based setup for RNN language models (i.e. recurrent and
       neural net based language models)
@@ -127,4 +127,24 @@
 
   \htmlinclude 5.3.html
 
+  \subsection versions_versions_54 Version 5.4
+
+
+  Version 5.4 is the current master branch. The main changes that were made between
+  the end of 5.3.x and the start of the 5.4 branch include:
+    - Some code changes in the nnet3 codebase, for speed and memory efficiency.
+    - Various simplifications and code reorganizations in the nnet3 code.
+    - Support for a new kind of factorized TDNN which gives substantially better
+      results than our old TDNN recipe, and is even better than our old TDNN+LSTM
+      recipe. A good example of this is in egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1n.sh.
+      Some nnet3 code changes were needed for this as well (mostly: support for constraining
+      a matrix to have orthonormal rows).
+
+  Below are patches corresponding to minor version numbers 5.4.x.
+
+  \htmlinclude 5.4.html
+
+
+
+
 */
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index 8ddba56b0e0..df0fb2d4502 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -15,7 +15,7 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \
            nnet-common-test convolution-test attention-test
 
 OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \
-           nnet-simple-component.o \
+           nnet-simple-component.o nnet-normalize-component.o \
            nnet-general-component.o nnet-parse.o natural-gradient-online.o \
            nnet-descriptor.o nnet-optimize.o nnet-computation.o \
            nnet-computation-graph.o nnet-graph.o am-nnet-simple.o \
diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc
index 88a14616f9d..445cc43f868 100644
--- a/src/nnet3/natural-gradient-online-test.cc
+++ b/src/nnet3/natural-gradient-online-test.cc
@@ -271,7 +271,7 @@ void UnitTestPreconditionDirectionsOnline() {
   if (Rand() % 3 == 0) zero = true;
   //else if (Rand() % 2 == 0) one = true;
-  CuVector<BaseFloat> row_prod1(N), row_prod2(N);
+  CuVector<BaseFloat> row_prod1(N);
   BaseFloat gamma1, gamma2;
   BaseFloat big_eig_factor = RandInt(1, 20);
   big_eig_factor = big_eig_factor * big_eig_factor;
@@ -301,14 +301,13 @@
   preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1);
-  preconditioner2.PreconditionDirections(&Mcopy2, &row_prod2, &gamma2);
+  preconditioner2.PreconditionDirections(&Mcopy2, &gamma2);
 
   BaseFloat trace1 = TraceMatMat(M, M, kTrans),
       trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans);
   AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02);
   AssertEqual(Mcopy1, Mcopy2);
-  AssertEqual(row_prod1, row_prod2, 1.0e-02);
   AssertEqual(gamma1, gamma2, 1.0e-02);
 
   // make sure positive definite
diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc
index 5ef413b9f38..b5740053f46 100644
--- a/src/nnet3/natural-gradient-online.cc
+++ b/src/nnet3/natural-gradient-online.cc
@@ -18,6 +18,7 @@
 // limitations under the License.
 
 #include "nnet3/natural-gradient-online.h"
+#include "nnet3/nnet-parse.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -26,8 +27,8 @@ namespace nnet3 {
 OnlineNaturalGradient::OnlineNaturalGradient():
     rank_(40), update_period_(1), num_samples_history_(2000.0),
     num_minibatches_history_(0.0), alpha_(4.0),
-    epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(-1),
-    num_updates_skipped_(0), self_debug_(false) { }
+    epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0),
+    self_debug_(false) { }
 
 /**
@@ -123,6 +124,7 @@ void OnlineNaturalGradient::Init(const CuMatrixBase<BaseFloat> &R0) {
   // for locking reasons it's better to use a different object.
   OnlineNaturalGradient this_copy(*this);
   this_copy.InitDefault(D);
+  this_copy.t_ = 1;  // Prevent recursion to Init() again.
  CuMatrix<BaseFloat> R0_copy(R0.NumRows(), R0.NumCols(), kUndefined);
  // 'num_iters' is number of iterations with the same data from a pseudorandom
@@ -146,52 +148,53 @@
   for (int32 i = 0; i < num_init_iters; i++) {
     BaseFloat scale;
     R0_copy.CopyFromMat(R0);
-    this_copy.PreconditionDirections(&R0_copy, NULL, &scale);
+    this_copy.PreconditionDirections(&R0_copy, &scale);
   }
   rank_ = this_copy.rank_;
   W_t_.Swap(&this_copy.W_t_);
   d_t_.Swap(&this_copy.d_t_);
   rho_t_ = this_copy.rho_t_;
-  t_ = 0;
 }
 
 void OnlineNaturalGradient::PreconditionDirections(
     CuMatrixBase<BaseFloat> *X_t,
-    CuVectorBase<BaseFloat> *row_prod,
     BaseFloat *scale) {
   if (X_t->NumCols() == 1) {
     // If the dimension of the space equals one then our natural gradient update
     // with rescaling becomes a no-op, but the code wouldn't naturally handle it
     // because rank would be zero. Support this as a special case.
-    if (row_prod)
-      row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-    *scale = 1.0;
-    return;
-  }
-
-  if (row_prod == NULL) {
-    CuVector<BaseFloat> row_prod_tmp(X_t->NumRows());
-    PreconditionDirections(X_t, &row_prod_tmp, scale);
+    if (scale)
+      *scale = 1.0;
     return;
   }
 
-  read_write_mutex_.lock();
-  if (t_ == -1) // not initialized
+  if (t_ == 0) // not initialized
     Init(*X_t);
 
-  // Now t_ >= 0.
-  // We create local copies of the class variables... this is intended for
-  // multi-threaded safety so we can't read them in an inconsistent state,
-  // but we don't really waste anything here (a copy of W_t is needed anyway,
-  // if we're to update it).
-  int32 t = t_, R = W_t_.NumRows(), D = W_t_.NumCols();
+  int32 R = W_t_.NumRows(), D = W_t_.NumCols();
   // space for W_t, J_t, K_t, L_t.
   CuMatrix<BaseFloat> WJKL_t(2 * R, D + R);
   WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_);
   BaseFloat rho_t(rho_t_);
   Vector<BaseFloat> d_t(d_t_);
-  read_write_mutex_.unlock();
-  PreconditionDirectionsInternal(t, rho_t, d_t, &WJKL_t, X_t, row_prod, scale);
+
+  bool updating = Updating();
+
+  BaseFloat initial_product;
+  initial_product = TraceMatMat(*X_t, *X_t, kTrans);
+
+  PreconditionDirectionsInternal(rho_t, initial_product,
+                                 updating, d_t, &WJKL_t, X_t);
+
+  if (scale) {
+    if (initial_product <= 0.0) {
+      *scale = 1.0;
+    } else {
+      BaseFloat final_product = TraceMatMat(*X_t, *X_t, kTrans);
+      *scale = sqrt(initial_product / final_product);
+    }
+  }
+  t_ += 1;
 }
 
 void OnlineNaturalGradient::ReorthogonalizeXt1(
@@ -318,13 +321,12 @@ void OnlineNaturalGradient::SelfTest() const {
 }
 
 void OnlineNaturalGradient::PreconditionDirectionsInternal(
-    const int32 t,
     const BaseFloat rho_t,
+    const BaseFloat tr_X_Xt,
+    bool updating,
     const Vector<BaseFloat> &d_t,
     CuMatrixBase<BaseFloat> *WJKL_t,
-    CuMatrixBase<BaseFloat> *X_t,
-    CuVectorBase<BaseFloat> *row_prod,
-    BaseFloat *scale) {
+    CuMatrixBase<BaseFloat> *X_t) {
   int32 N = X_t->NumRows(),  // Minibatch size.
      D = X_t->NumCols(),  // Dimensions of vectors we're preconditioning
      R = rank_;  // Rank of correction to unit matrix.
@@ -343,57 +345,11 @@
   H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0);  // H_t = X_t W_t^T
 
-  bool locked = update_mutex_.try_lock();
-  if (locked) {
-    // We'll release the lock if we don't plan to update the parameters.
-
-    // Explanation of the conditions below:
-    // if (frozen_): because we don't do the update if the user called Freeze().
-    // I forget why the (t_ > t) is here; probably some race condition encountered
-    // a long time ago. Not important; nnet3 doesn't use multiple threads anyway.
-    // The condition:
-    // (num_updates_skipped_ < update_period_ - 1 && t_ >= num_initial_updates)
-    // means we can update if either we're in the first 10 updates (e.g. first
-    // 10 minibatches), or if we've skipped 'update_period_ - 1' batches of data
-    // without updating the parameters (this allows us to update only, say,
-    // every 4 times, for speed, after updating the first 10 times).
-
-    // Just hard-code it here that we do 10 initial updates before skipping any.
-    const int num_initial_updates = 10;
-    if (frozen_ || t_ > t || (num_updates_skipped_ < update_period_ - 1 &&
-                              t_ >= num_initial_updates)) {
-      update_mutex_.unlock();
-      // We got the lock but we were already beaten to it by another thread, or
-      // we don't want to update yet due to update_period_ > 1 (this saves
-      // compute), so release the lock.
-      locked = false;
-    }
-  }
-
-  if (!locked) {
-    // We're not updating the parameters, either because another thread is
-    // working on updating them, or because another thread already did so from
-    // the same or later starting point (making our update stale), or because
-    // update_period_ > 1. We just apply the preconditioning and return.
-
-    // note: we don't bother with any locks before checking frozen_ or incrementing
-    // num_updates_skipped_ below, because the worst that could happen is that,
-    // on very rare occasions, we could skip one or two more updates than we
-    // intended.
-    if (!frozen_)
-      num_updates_skipped_++;
-
-    BaseFloat tr_Xt_XtT = TraceMatMat(*X_t, *X_t, kTrans);
+  if (!updating) {
+    // We're not updating the estimate of the Fisher matrix; we just apply the
+    // preconditioning and return.
 
     // X_hat_t = X_t - H_t W_t
     X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);
-    // each element i of row_prod will be inner product of row i of X_hat_t with
-    // itself.
-    row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-    BaseFloat tr_Xhat_XhatT = row_prod->Sum();
-    KALDI_ASSERT(tr_Xhat_XhatT == tr_Xhat_XhatT);  // Check for NaN.
-    BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
-                         sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
-    *scale = gamma_t;
     return;
   }
   J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0);  // J_t = H_t^T X_t
@@ -457,31 +413,14 @@
   if (nf > 0 && self_debug_) {
     KALDI_WARN << "Floored " << nf << " elements of C_t.";
   }
-  BaseFloat tr_Xt_XtT_check;
-  if (self_debug_)
-    tr_Xt_XtT_check = TraceMatMat(*X_t, *X_t, kTrans);
 
   X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0);  // X_hat_t = X_t - H_t W_t
-  // set *row_prod to inner products of each row of X_hat_t with itself.
-  row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0);
-
-  BaseFloat tr_Xhat_XhatT = row_prod->Sum();
-  // tr(X_t X_t^T) = tr(X_hat_t X_hat_t^T) - tr(L_t E_t) + 2 tr(L_t)
-  double tr_Xt_XtT = tr_Xhat_XhatT;
-  for (int32 i = 0; i < R; i++)
-    tr_Xt_XtT += L_t_cpu(i, i) * (2.0 - e_t(i));
-  if (self_debug_) {
-    KALDI_ASSERT(ApproxEqual(tr_Xt_XtT, tr_Xt_XtT_check));
-  }
-  BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 :
-                       sqrt(tr_Xt_XtT / tr_Xhat_XhatT));
-  *scale = gamma_t;
 
   Vector<BaseFloat> sqrt_c_t(c_t);
   sqrt_c_t.ApplyPow(0.5);
 
   // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})).
-  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_Xt_XtT +
+  BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_X_Xt +
                                       (1-eta)*(D * rho_t + d_t.Sum()) - sqrt_c_t.Sum());
 
   // D_{t+1} = C_t^{0.5} - \rho_{t+1} I
@@ -507,22 +446,25 @@
                         &L_t);
   }
 
-  // Commit the new parameters.
-  read_write_mutex_.lock();
-  KALDI_ASSERT(t_ == t);  // we already ensured this.
-  t_ = t + 1;
-  num_updates_skipped_ = 0;
   W_t_.Swap(&W_t1);
   d_t_.CopyFromVec(d_t1);
   rho_t_ = rho_t1;
 
   if (self_debug_)
     SelfTest();
+}
+
+bool OnlineNaturalGradient::Updating() const {
+  // Just hard-code it here that we do 10 initial updates before skipping any.
+  // This must be > 'num_init_iters = 3' from Init().
+  const int num_initial_updates = 10;
 
-  read_write_mutex_.unlock();
-  update_mutex_.unlock();
+  return (!frozen_ &&
+          (t_ <= num_initial_updates ||
+           (t_ - num_initial_updates) % update_period_ == 0));
 }
 
+
 BaseFloat OnlineNaturalGradient::Eta(int32 N) const {
   if (num_minibatches_history_ > 0.0) {
     KALDI_ASSERT(num_minibatches_history_ > 1.0);
@@ -636,12 +578,10 @@ OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other):
     num_samples_history_(other.num_samples_history_),
     num_minibatches_history_(other.num_minibatches_history_),
     alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_),
-    frozen_(other.frozen_),
-    t_(other.t_), num_updates_skipped_(other.num_updates_skipped_),
+    frozen_(other.frozen_), t_(other.t_),
     self_debug_(other.self_debug_), W_t_(other.W_t_),
-    rho_t_(other.rho_t_), d_t_(other.d_t_) {
-  // use default constructor for the mutexes.
-}
+    rho_t_(other.rho_t_), d_t_(other.d_t_) { }
+
 
 OnlineNaturalGradient& OnlineNaturalGradient::operator = (
     const OnlineNaturalGradient &other) {
diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h
index 67c25eb0dbc..0b05948977e 100644
--- a/src/nnet3/natural-gradient-online.h
+++ b/src/nnet3/natural-gradient-online.h
@@ -437,33 +437,52 @@ class OnlineNaturalGradient {
   // see comment where 'frozen_' is declared.
   inline void Freeze(bool frozen) { frozen_ = frozen; }
 
-  // The "R" pointer is both the input (R in the comment) and the output (P in
-  // the comment; equal to the preconditioned directions before scaling by
-  // gamma). If the pointer "row_prod" is supplied, it's set to the inner product
-  // of each row of the preconditioned directions P, at output, with itself.
-  // You would need to apply "scale" to R and "scale * scale" to row_prod, to
-  // get the preconditioned directions; we don't do this ourselves, in order to
-  // save CUDA calls.
+  /**
+     This call implements the main functionality of this class.
+
+     @param [in,out] R  The "R" pointer is both the input (R in the
+            comment, X in the paper), and the output (P in the comment,
+            X with a hat on it in the paper). Each row of R is viewed
+            as a vector in some space, where we're estimating a smoothed
+            Fisher matrix and then multiplying by the inverse of that
+            smoothed Fisher matrix.
+
+     @param [out] scale  If non-NULL, a scaling factor is written to here,
+            and the output 'R' should be multiplied by this factor by
+            the user (we don't do it internally, to save an operation).
+            The factor is chosen so that the vector 2-norm of R is the
+            same after the natural gradient as it was before. (The pointer
+            being NULL or non-NULL doesn't affect the magnitude of R;
+            in any case the user will probably want to do this rescaling,
+            the question being whether they want to do so manually or
+            not.)
+  */
   void PreconditionDirections(CuMatrixBase<BaseFloat> *R,
-                              CuVectorBase<BaseFloat> *row_prod,
                               BaseFloat *scale);
+
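// [Editor's sketch, not part of the patch: the new calling convention in a
// typical SGD-style update. 'params', 'deriv' and 'learning_rate' are
// hypothetical names; only PreconditionDirections() and the returned 'scale'
// come from the class above.]
void ExampleNaturalGradientUpdate(OnlineNaturalGradient *preconditioner,
                                  CuMatrixBase<BaseFloat> *params,
                                  CuMatrixBase<BaseFloat> *deriv,
                                  BaseFloat learning_rate) {
  BaseFloat scale;
  preconditioner->PreconditionDirections(deriv, &scale);
  // The caller folds 'scale' into its own update, which saves one CUDA
  // operation inside the call; compare UpdateNaturalGradient() in
  // nnet-convolutional-component.cc at the end of this patch.
  params->AddMat(learning_rate * scale, *deriv);
}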
+  // Copy constructor.
   explicit OnlineNaturalGradient(const OnlineNaturalGradient &other);
 
   // Assignment operator
   OnlineNaturalGradient &operator = (const OnlineNaturalGradient &other);
 
  private:
 
-  // This does the work of PreconditionDirections (the top-level
-  // function handles some multithreading issues and then calls this function).
+  // This is an internal function called from PreconditionDirections().
   // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ].
-  void PreconditionDirectionsInternal(const int32 t,
-                                      const BaseFloat rho_t,
+  void PreconditionDirectionsInternal(const BaseFloat rho_t,
+                                      const BaseFloat tr_X_Xt,
+                                      bool updating,
                                       const Vector<BaseFloat> &d_t,
                                       CuMatrixBase<BaseFloat> *WJKL_t,
-                                      CuMatrixBase<BaseFloat> *X_t,
-                                      CuVectorBase<BaseFloat> *row_prod,
-                                      BaseFloat *scale);
+                                      CuMatrixBase<BaseFloat> *X_t);
+
+
+  // Works out from t_ and various class variables whether we will update
+  // the parameters on this iteration (returns true if so).
+  bool Updating() const;
 
   void ComputeEt(const VectorBase<BaseFloat> &d_t, BaseFloat beta_t,
@@ -512,10 +531,14 @@
   // or columns.
   static void InitOrthonormalSpecial(CuMatrixBase<BaseFloat> *R);
 
-  // Returns the learning rate eta as the function of the number of samples
-  // (actually, N is the number of vectors we're preconditioning, which due to
-  // context is not always exactly the same as the number of samples). The
-  // value returned depends on num_samples_history_.
+  // Returns the value eta (with 0 < eta < 1) which reflects how fast we update
+  // the estimate of the Fisher matrix (larger == faster). This is a function
+  // rather than a constant because we set this indirectly, via
+  // num_samples_history_ or num_minibatches_history_. The argument N is the
+  // number of vectors we're preconditioning, i.e. the number of rows in the
+  // argument R to PreconditionDirections() (in the common case, some multiple
+  // of the minibatch size).
   BaseFloat Eta(int32 N) const;
 
   // called if self_debug_ = true, makes sure the members satisfy certain
@@ -577,29 +600,16 @@
   // the *second* time we see the same data (to avoid biasing the update).
   bool frozen_;
 
-  // t is a counter that measures how many updates we've done.
+  // t is a counter that measures how many times the user has previously called
+  // PreconditionDirections(); it's 0 if that has never been called.
   int32 t_;
 
-  // This keeps track of how many minibatches we've skipped updating the parameters,
-  // since the most recent update; it's used in enforcing "update_period_", which
-  // is a mechanism to avoid spending too much time updating the subspace (which can
-  // be wasteful).
-  int32 num_updates_skipped_;
-
   // If true, activates certain checks.
   bool self_debug_;
 
   CuMatrix<BaseFloat> W_t_;
   BaseFloat rho_t_;
   Vector<BaseFloat> d_t_;
-
-
-  // Used to prevent parameters being read or written in an inconsistent state.
-  std::mutex read_write_mutex_;
-
-  // This mutex is used to control which thread gets to update the
-  // parameters, in multi-threaded code.
-  std::mutex update_mutex_;
 };
 
 } // namespace nnet3
diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc
index 9c48744fadc..ec1d3fa0f2e 100644
--- a/src/nnet3/nnet-analyze.cc
+++ b/src/nnet3/nnet-analyze.cc
@@ -238,6 +238,23 @@ std::string ComputationVariables::DescribeVariable(int32 variable) const {
   return os.str();
 }
 
+NnetComputation::SubMatrixInfo ComputationVariables::VariableInfo(
+    int32 variable) const {
+  KALDI_ASSERT(variable >= 0 && variable < num_variables_);
+  int32 matrix_index = variable_to_matrix_[variable],
+      offset = variable - matrix_to_variable_index_[matrix_index],
+      num_column_variables = column_split_points_[matrix_index].size() - 1,
+      column_variable = offset % num_column_variables,
+      row_variable = offset / num_column_variables;
+  int32 row_offset = row_split_points_[matrix_index][row_variable],
+      num_rows = row_split_points_[matrix_index][row_variable+1] - row_offset,
+      col_offset = column_split_points_[matrix_index][column_variable],
+      num_cols = column_split_points_[matrix_index][column_variable+1] -
+      col_offset;
+  return NnetComputation::SubMatrixInfo(matrix_index, row_offset, num_rows,
+                                        col_offset, num_cols);
+}
+
 
 /// given a vector of pairs from computation.indexes_multi_indexes
 /// containing pairs (submatrix-index, row-index), this function outputs
@@ -367,6 +384,14 @@ void ComputeCommandAttributes(
       vars.RecordAccessForSubmatrix(c.arg2, kReadAccess, &attr);
       break;
     }
+    case kCompressMatrix: {
+      vars.RecordAccessForSubmatrix(c.arg1, kReadWriteAccess, &attr);
+      break;
+    }
+    case kDecompressMatrix: {
+      vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr);
+      break;
+    }
     case kAcceptInput: {
       vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr);
       break;
    }
@@ -555,6 +580,7 @@ void ComputationChecker::Check() {
   CheckComputationIndexes();
   a_.Init(nnet_, computation_);
   CheckComputationMatrixAccesses();
+  CheckComputationCompression();
   CheckComputationUndefined();
   CheckComputationDebugInfo();
   if (config_.check_rewrite)
@@ -608,16 +634,36 @@ void ComputationChecker::CheckComputationRewrite() const {
   Checks for the situation where a variable is read before being written.
 */
 void ComputationChecker::CheckComputationUndefined() const {
+  // the variable 'min_proportion' needs to be <= the min_proportion_ value in
+  // class MatrixExtender, otherwise this code could spuriously reject a
+  // computation.
+  BaseFloat min_proportion = 0.8;
+
   int32 num_variables = a_.variable_accesses.size();
   for (int32 v = 0; v < num_variables; v++) {
     const std::vector<Access> &accesses = a_.variable_accesses[v];
     if (accesses.empty()) {
       if (config_.check_unused_variables) {
+        NnetComputation::SubMatrixInfo info = a_.variables.VariableInfo(v);
+        const NnetComputation::MatrixInfo &matrix_info =
+            computation_.matrices[info.matrix_index];
+        // Before we throw an error, we want to check that it isn't a case that
+        // can be produced by the ExtendMatrices() optimization, that is
+        // actually allowed. This is a case when a variable is inside the last
+        // few rows of a matrix, but not all columns of those last rows.
+        if (info.row_offset >= min_proportion * matrix_info.num_rows &&
+            !(info.col_offset == 0 && info.num_cols == matrix_info.num_cols)) {
+          continue;
+        }
         KALDI_ERR << "Variable " << v << " == "
                   << a_.variables.DescribeVariable(v) << " is never used.";
       }
     } else {
-      if (accesses[0].access_type != kWriteAccess)
+      // It's OK if a part of a matrix that is undefined gets compressed;
+      // likely that part won't be referred to when we uncompress.
+      if (accesses[0].access_type != kWriteAccess &&
+          !(computation_.commands[accesses[0].command_index].command_type ==
+            kCompressMatrix))
         KALDI_ERR << "Variable " << v << " == "
                   << a_.variables.DescribeVariable(v)
                   << " is read before it is written to";
@@ -647,9 +693,10 @@ void ComputationChecker::CheckComputationMatrixAccesses() const {
       KALDI_ERR << "Matrix m" << matrix_index << " is accessed before "
           "it is initialized";
     }
-    if (accesses.accesses.size() == 1) {
+    if (accesses.accesses.size() == 1 && config_.check_unused_variables) {
       int32 first_access_command = accesses.accesses[0].command_index;
       if (computation_.commands[first_access_command].command_type == kSetConst) {
         KALDI_ERR << "Matrix m" << matrix_index << " is only set to a constant "
                   << "value, but then never accessed.";
       }
     }
@@ -678,6 +725,64 @@
   }
 }
 
+void ComputationChecker::CheckComputationCompression() const {
+  int32 num_matrices = a_.matrix_accesses.size();
+
+  // 'middle_command' will be the index of the command that separates
+  // the forward and backward passes.
+  int32 middle_command = -1;
+  for (size_t i = 0; i < computation_.commands.size(); i++) {
+    if (computation_.commands[i].command_type == kNoOperationMarker) {
+      middle_command = static_cast<int32>(i);
+      break;
+    }
+  }
+  for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) {
+    const MatrixAccesses &accesses = a_.matrix_accesses[matrix_index];
+    int32 num_accesses = accesses.accesses.size();
+    for (int32 a = 0; a < num_accesses; a++) {
+      const Access &access = accesses.accesses[a];
+      int32 command_index = access.command_index;
+      const NnetComputation::Command &command =
+          computation_.commands[command_index];
+      if (command.command_type == kDecompressMatrix) {
+        // check that the previous access to this matrix was a compression
+        // command.
+        KALDI_ASSERT(
+            a > 0 && computation_.commands[
+                accesses.accesses[a-1].command_index].command_type ==
+            kCompressMatrix);
+      }
+      if (command.command_type == kCompressMatrix) {
+        // check that the next access to this matrix is an uncompression
+        // command.
+        int32 next_command_index = accesses.accesses[a+1].command_index;
+        KALDI_ASSERT(computation_.commands[next_command_index].command_type ==
+                     kDecompressMatrix &&
+                     command_index < middle_command &&
+                     next_command_index > middle_command);
+        if (command.alpha == 0.0) {
+          // alpha == 0.0 means we're only retaining the sign; we should
+          // only do this if this is the output of a ReLU.
+          // make sure there are only 2 commands after this: the uncompress
+          // command, and a relu backprop command. (Any deallocation
+          // command doesn't show up in the list of 'accesses').
+          KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 &&
+                       num_accesses == a + 3);
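// [Editor's illustration, not from the patch: the command pattern the
// assertions above enforce for sign-only (alpha == 0.0) compression; the
// command indices are made up. ReLU backprop only needs the sign of the
// output to gate the derivatives, which is why one uint8 per element
// suffices.]
//
//   c11: m3 = Propagate(relu, ...)             (forward pass)
//   c12: CompressMatrix(m3, 0, uint8, false)   (keep only sign(m3) > 0)
//   ...
//   c20: [kNoOperationMarker]                  (forward/backward boundary)
//   ...
//   c30: DecompressMatrix(m3)
//   c31: Backprop(relu, ..., m3, ...)          (the only later use of m3)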
+          // make sure the next access to that matrix, apart from the
+          // uncompression command, is a ReLU backprop.
+          int32 next_command_index = accesses.accesses[a+2].command_index;
+          const NnetComputation::Command &next_command =
+              computation_.commands[next_command_index];
+          KALDI_ASSERT(next_command.command_type == kBackprop &&
+                       nnet_.GetComponent(next_command.arg1)->Type() ==
+                       "RectifiedLinearComponent");
+        }
+      }
+    }
+  }
+}
+
 
 /**
    This very basic check just makes sure that all indexes in the commands are
    within range, that dimensions agree with the request, that row/column dimensions
@@ -930,6 +1035,26 @@ void ComputationChecker::CheckComputationIndexes() const {
      }
      break;
    }
+    case kCompressMatrix: {
+      if (c.arg1 < 1 || c.arg1 >= num_submatrices ||
+          !computation_.IsWholeMatrix(c.arg1))
+        KALDI_ERR << "submatrix index out of range or invalid";
+      if (c.arg2 < static_cast<int32>(kCompressedMatrixInt8) ||
+          c.arg2 > static_cast<int32>(kCompressedMatrixUint16))
+        KALDI_ERR << "Invalid compressed-matrix type.";
+      if (c.arg3 != 0 && c.arg3 != 1)
+        KALDI_ERR << "Invalid 'truncate' option for compressing matrix.";
+      if (c.alpha < 0.0 || c.alpha > 1000.0 ||
+          (c.alpha == 0.0 && c.arg2 != kCompressedMatrixUint8))
+        KALDI_ERR << "Invalid alpha in kCompressMatrix command.";
+      break;
+    }
+    case kDecompressMatrix: {
+      if (c.arg1 < 1 || c.arg1 >= num_submatrices ||
+          !computation_.IsWholeMatrix(c.arg1))
+        KALDI_ERR << "submatrix index out of range or invalid";
+      break;
+    }
    case kAcceptInput: case kProvideOutput: {
      if (c.arg1 < 1 || c.arg1 >= num_submatrices ||
          !computation_.IsWholeMatrix(c.arg1))
@@ -1081,6 +1206,23 @@ int32 ComputationAnalysis::FirstNontrivialAccess(int32 s) const {
 }
 
 
+int32 ComputationAnalysis::FirstAccess(int32 s) const {
+  KALDI_ASSERT(static_cast<size_t>(s) < computation_.submatrices.size() && s > 0);
+  int32 ans = computation_.commands.size();
+  std::vector<int32> variable_indexes;
+  analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes);
+  std::vector<int32>::const_iterator iter = variable_indexes.begin(),
+      end = variable_indexes.end();
+  for (; iter != end; ++iter) {
+    int32 v = *iter;
+    const std::vector<Access> &accesses = analyzer_.variable_accesses[v];
+    if (!accesses.empty())
+      ans = std::min(ans, accesses[0].command_index);
+  }
+  return ans;
+}
+
+
 int32 ComputationAnalysis::FirstNontrivialMatrixAccess(int32 m) const {
   KALDI_ASSERT(static_cast<size_t>(m) < computation_.matrices.size() && m > 0);
   int32 ans = computation_.commands.size();
@@ -1301,13 +1443,20 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) {
       num_submatrices = computation.submatrices.size();
   for (int32 command_index = 0; command_index < num_commands; ++command_index) {
     const NnetComputation::Command &c = computation.commands[command_index];
-    int64 this_num_bytes = -100000000;
+    int64 this_num_bytes = -100000000,
+        this_compressed_num_bytes = -10000000;
 
     if (c.arg1 >= 0 && c.arg1 < num_submatrices) {
       // if arg1 could plausibly be a sub-matrix index...
       const NnetComputation::SubMatrixInfo &submat_info =
          computation.submatrices[c.arg1];
      this_num_bytes = static_cast<int64>(sizeof(BaseFloat)) *
          submat_info.num_rows * submat_info.num_cols;
+
+      this_compressed_num_bytes =
+          ((c.arg2 == static_cast<int32>(kCompressedMatrixInt8) ||
+            c.arg2 == static_cast<int32>(kCompressedMatrixUint8)) ?
+           1 : 2) * static_cast<int64>(submat_info.num_rows) *
+          submat_info.num_cols;
    }
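// [Editor's arithmetic example of the accounting above, with made-up sizes:
// for a 1000 x 512 matrix of 4-byte BaseFloats, this_num_bytes is
// 4 * 1000 * 512 = 2048000, while uint8 compression gives
// this_compressed_num_bytes = 1 * 1000 * 512 = 512000. So a kCompressMatrix
// command (handled in the switch below) changes cur_memory_use by
// -1536000 bytes, and the matching kDecompressMatrix adds the same
// amount back.]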
 
     switch (c.command_type) {
       case kAllocMatrix:
@@ -1317,6 +1466,12 @@
         cur_memory_use += this_num_bytes;
         break;
       case kDeallocMatrix:
         cur_memory_use -= this_num_bytes;
         break;
+      case kCompressMatrix:
+        cur_memory_use += this_compressed_num_bytes - this_num_bytes;
+        break;
+      case kDecompressMatrix:
+        cur_memory_use += this_num_bytes - this_compressed_num_bytes;
+        break;
       default:
         break;
     }
diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h
index 259a4546d53..77466039756 100644
--- a/src/nnet3/nnet-analyze.h
+++ b/src/nnet3/nnet-analyze.h
@@ -160,6 +160,7 @@ class ComputationVariables {
   // zero indexing): something like "m1" or "m1(0:99,:)" or "m1(0:19,10:49)"
   std::string DescribeVariable(int32 variable) const;
 
+  NnetComputation::SubMatrixInfo VariableInfo(int32 variable) const;
  private:
   // sets up split_points_, matrix_to_variable_index_, and num_variables_.
   // called from constructor.
@@ -321,6 +322,13 @@ class ComputationAnalysis {
   /// s must be >0 (i.e. not the empty submatrix).
   int32 FirstNontrivialAccess(int32 s) const;
 
+  /// Returns the first command (read or write) that accesses any part of 's',
+  /// including possibly zeroing it. [note: kAllocMatrix, kSwapMatrix and
+  /// kDeallocMatrix do not count as read or write operations]. If there is no
+  /// such command, it returns num_commands. s must be >0 (i.e. not the empty
+  /// submatrix).
+  int32 FirstAccess(int32 s) const;
+
   /// Returns the last non-deallocation command that accesses any part of
   /// submatrix 's'; if there is no such command it returns -1.
   /// s must be >0 (i.e. not the empty submatrix).
@@ -385,7 +393,7 @@ struct CheckComputationOptions {
   // legitimately fail after optimization. see code for details.
   bool check_rewrite;
 
   // If 'check_unused_variables' is true, it checks for unused variables
-  // (e.g. unused partsof matrices). We only set it false for online
+  // (e.g. unused parts of matrices). We only set it false for online
   // computations, where there can be instances where a part of a matrix is
   // apparently never accessed (until we consider that the matrix is swapped
   // with another).
@@ -407,15 +415,17 @@ class ComputationChecker {
                      const NnetComputation &computation);
   void Check();  // call this only once.
  private:
-  // various dimension consistency checks and checks on properties.
+  // Various dimension consistency checks and checks on properties.
   void CheckComputationIndexes() const;
-  // checks for a situation where an undefined variable is read.
+  // Checks for a situation where an undefined variable is read.
   void CheckComputationUndefined() const;
-  // checks that all writes are done before reads. details with implementation.
+  // Checks that all writes are done before reads. details with implementation.
   void CheckComputationRewrite() const;
-  // check matrix accesses make sense.
+  // Check matrix accesses make sense.
   void CheckComputationMatrixAccesses() const;
-  // check debug_info has the correct size, if used.
+  // Some checks related to the kCompressMatrix and kDecompressMatrix commands.
+  void CheckComputationCompression() const;
+  // Check debug_info has the correct size, if used.
   void CheckComputationDebugInfo() const;
 
   const CheckComputationOptions &config_;
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 3e6d8599382..2080c60077b 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -95,8 +95,11 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
 
 void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
                                      const NnetComputation &computation) {
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats. This is mainly important for batch-norm.
   NnetComputer computer(nnet_config.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.inputs);
   computer.Run();
@@ -120,6 +123,10 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg,
   // happens when we use the model with batchnorm test-mode set).
   ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_);
 
+  // The following will only do something if we have a LinearComponent
+  // or AffineComponent with orthonormal-constraint set to a nonzero value.
+  ConstrainOrthonormal(nnet_);
+
   // Scale delta_nnet
   if (success)
     ScaleNnet(nnet_config.momentum, delta_nnet_);
@@ -131,8 +138,11 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg,
                                                const NnetComputation &computation,
                                                bool is_backstitch_step1) {
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats. This is mainly important for batch-norm.
   NnetComputer computer(nnet_config.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.inputs);
   computer.Run();
@@ -168,6 +178,21 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg,
       nnet_config.max_param_change, max_change_scale, scale_adding, nnet_,
       &num_max_change_per_component_applied_, &num_max_change_global_applied_);
 
+  if (is_backstitch_step1) {
+    // The following will only do something if we have a LinearComponent or
+    // AffineComponent with orthonormal-constraint set to a nonzero value. We
+    // choose to do this only on the 1st backstitch step, for efficiency.
+    ConstrainOrthonormal(nnet_);
+  }
+
+  if (!is_backstitch_step1) {
+    // Scale down the batchnorm stats (keeps them fresh... this affects what
+    // happens when we use the model with batchnorm test-mode set). Do this
+    // after backstitch step 2 so that the stats are scaled down before we start
+    // the next minibatch.
+    ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_);
+  }
+
   ScaleNnet(0.0, delta_nnet_);
 }
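// [Editor's aside, not part of the patch: a rough picture of what
// ConstrainOrthonormal() (declared in nnet-utils.h) is described as doing.
// One standard way to pull a parameter matrix M toward satisfying
// M M^T = I is a gradient step on f(M) = ||M M^T - I||_F^2, whose gradient
// is 4 (M M^T - I) M. The sketch below is illustrative only; the real
// routine differs in details such as step-size selection and support for a
// scaled constraint M M^T = alpha^2 I.]
void ApproxOrthonormalStep(CuMatrixBase<BaseFloat> *M, BaseFloat nu) {
  int32 rows = M->NumRows();
  CuMatrix<BaseFloat> P(rows, rows);
  P.AddMatMat(1.0, *M, kNoTrans, *M, kTrans, 0.0);  // P = M M^T
  P.AddToDiag(-1.0);                                // P = M M^T - I
  CuMatrix<BaseFloat> M_old(*M);
  // M <- M - 4 * nu * (M M^T - I) M
  M->AddMatMat(-4.0 * nu, P, kNoTrans, M_old, kNoTrans, 1.0);
}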
@@ -196,9 +221,6 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2,
     bool use_xent = (opts_.chain_config.xent_regularize != 0.0);
     std::string xent_name = sup.name + "-xent";  // typically "output-xent".
     CuMatrix<BaseFloat> xent_deriv;
-    if (use_xent)
-      xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(),
-                        kUndefined);
 
     BaseFloat tot_objf, tot_l2_term, tot_weight;
diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc
index 75350d3d8f6..31ff9819dfa 100644
--- a/src/nnet3/nnet-common.cc
+++ b/src/nnet3/nnet-common.cc
@@ -440,6 +440,11 @@ void PrintIndexes(std::ostream &os,
     os << "[ ]";
     return;
   }
+  // If the string is longer than 'max_string_length' characters, it will
+  // be summarized with '...' in the middle.
+  size_t max_string_length = 200;
+  std::ostringstream os_temp;
+
   // range_starts will be the starts of ranges (with consecutive t values and
   // the same n value and zero x values) that we compactly print. we'll append
   // "end" to range_starts for convenience.
@@ -457,23 +462,32 @@
   }
   range_starts.push_back(cur_start);
   range_starts.push_back(end);
-  os << "[";
+  os_temp << "[";
   int32 num_ranges = range_starts.size() - 1;
   for (int32 r = 0; r < num_ranges; r++) {
     int32 range_start = range_starts[r],
         range_end = range_starts[r+1];
     KALDI_ASSERT(range_end > range_start);
-    os << "(" << indexes[range_start].n << ",";
+    os_temp << "(" << indexes[range_start].n << ",";
     if (range_end == range_start + 1)
-      os << indexes[range_start].t;
+      os_temp << indexes[range_start].t;
     else
-      os << indexes[range_start].t << ":" << indexes[range_end - 1].t;
+      os_temp << indexes[range_start].t << ":" << indexes[range_end - 1].t;
     if (indexes[range_start].x != 0)
-      os << "," << indexes[range_start].x;
-    os << ")";
+      os_temp << "," << indexes[range_start].x;
+    os_temp << ")";
     if (r + 1 < num_ranges)
-      os << ", ";
+      os_temp << ", ";
+  }
+  os_temp << "]";
+
+  std::string str = os_temp.str();
+  if (str.size() <= max_string_length) {
+    os << str;
+  } else {
+    size_t len = str.size();
+    os << str.substr(0, max_string_length / 2) << " ... "
+       << str.substr(len - max_string_length / 2);
   }
-  os << "]";
 }
 
 void PrintCindexes(std::ostream &ostream,
diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc
index fa8a2322e5a..1a5ceabab0e 100644
--- a/src/nnet3/nnet-compile-looped.cc
+++ b/src/nnet3/nnet-compile-looped.cc
@@ -357,7 +357,6 @@ void CreateLoopedComputationRequestSimple(const Nnet &nnet,
                                           ComputationRequest *request1,
                                           ComputationRequest *request2,
                                           ComputationRequest *request3) {
-  bool has_ivector = (nnet.InputDim("ivector") > 0);
   int32 left_context, right_context;
   ComputeSimpleNnetContext(nnet, &left_context, &right_context);
 
diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc
index 82010fea58d..ce4bbd0940a 100644
--- a/src/nnet3/nnet-component-itf.cc
+++ b/src/nnet3/nnet-component-itf.cc
@@ -23,6 +23,7 @@
 #include
 #include "nnet3/nnet-component-itf.h"
 #include "nnet3/nnet-simple-component.h"
+#include "nnet3/nnet-normalize-component.h"
 #include "nnet3/nnet-general-component.h"
 #include "nnet3/nnet-convolutional-component.h"
 #include "nnet3/nnet-attention-component.h"
@@ -331,24 +332,23 @@ std::string UpdatableComponent::Info() const {
 
 void NonlinearComponent::StoreStatsInternal(
     const CuMatrixBase<BaseFloat> &out_value,
     const CuMatrixBase<BaseFloat> *deriv) {
-  KALDI_ASSERT(out_value.NumCols() == InputDim());
+  KALDI_ASSERT(out_value.NumCols() == dim_);
 
   // Check we have the correct dimensions.
-  if (value_sum_.Dim() != InputDim() ||
-      (deriv != NULL && deriv_sum_.Dim() != InputDim())) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    if (value_sum_.Dim() != InputDim()) {
-      value_sum_.Resize(InputDim());
+  if (value_sum_.Dim() != dim_ ||
+      (deriv != NULL && deriv_sum_.Dim() != dim_)) {
+    if (value_sum_.Dim() != dim_) {
+      value_sum_.Resize(dim_);
       count_ = 0.0;
     }
-    if (deriv != NULL && deriv_sum_.Dim() != InputDim()) {
-      deriv_sum_.Resize(InputDim());
+    if (deriv != NULL && deriv_sum_.Dim() != dim_) {
+      deriv_sum_.Resize(dim_);
       count_ = 0.0;
       value_sum_.SetZero();
     }
   }
   count_ += out_value.NumRows();
-  CuVector<BaseFloat> temp(InputDim());
+  CuVector<BaseFloat> temp(dim_);
   temp.AddRowSumMat(1.0, out_value, 0.0);
   value_sum_.AddVec(1.0, temp);
   if (deriv != NULL) {
@@ -357,22 +357,39 @@
   }
 }
 
+void NonlinearComponent::StoreBackpropStats(
+    const CuMatrixBase<BaseFloat> &out_deriv) {
+  // to save a little time, we skip storing these stats for about one
+  // minibatch in four.
+  if (RandInt(0, 3) == 0)
+    return;
+
+  KALDI_ASSERT(out_deriv.NumCols() == dim_);
+
+  // Check we have the correct dimensions.
+  if (oderiv_sumsq_.Dim() != dim_) {
+    oderiv_sumsq_.Resize(dim_);
+    oderiv_count_ = 0.0;
+  }
+  CuVector<BaseFloat> temp(dim_);
+  temp.AddDiagMat2(1.0, out_deriv, kTrans, 0.0);
+  oderiv_sumsq_.AddVec(1.0, temp);
+  oderiv_count_ += out_deriv.NumRows();
+}
+
+
 void NonlinearComponent::ZeroStats() {
   value_sum_.SetZero();
   deriv_sum_.SetZero();
+  oderiv_sumsq_.SetZero();
   count_ = 0.0;
+  oderiv_count_ = 0.0;
   num_dims_self_repaired_ = 0.0;
   num_dims_processed_ = 0.0;
 }
 
 std::string NonlinearComponent::Info() const {
   std::stringstream stream;
-  if (InputDim() == OutputDim()) {
-    stream << Type() << ", dim=" << InputDim();
-  } else {
-    stream << Type() << ", input-dim=" << InputDim()
-           << ", output-dim=" << OutputDim();
-  }
+  stream << Type() << ", dim=" << dim_;
   if (block_dim_ != dim_)
     stream << ", block-dim=" << block_dim_;
   if (self_repair_lower_threshold_ != BaseFloat(kUnsetThreshold))
@@ -392,19 +409,30 @@
     value_avg.Scale(1.0 / count_);
     stream << ", value-avg=" << SummarizeVector(value_avg);
     if (deriv_sum_.Dim() == dim_) {
-      Vector<double> deriv_avg_dbl(deriv_sum_);
-      Vector<BaseFloat> deriv_avg(deriv_avg_dbl);
+      Vector<double> deriv_avg(deriv_sum_);
       deriv_avg.Scale(1.0 / count_);
       stream << ", deriv-avg=" << SummarizeVector(deriv_avg);
     }
   }
+  if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) {
+    Vector<double> oderiv_rms(oderiv_sumsq_);
+    oderiv_rms.Scale(1.0 / oderiv_count_);
+    // The ApplyFloor() is so that the statement after it does not fail even if we
+    // had subtracted models (e.g. in full_progress.*.log).
+    oderiv_rms.ApplyFloor(0.0);
+    oderiv_rms.ApplyPow(0.5);
+    stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms)
+           << ", oderiv-count=" << oderiv_count_;
+  }
   return stream.str();
 }
 
 void NonlinearComponent::Scale(BaseFloat scale) {
   value_sum_.Scale(scale);
   deriv_sum_.Scale(scale);
+  oderiv_sumsq_.Scale(scale);
   count_ *= scale;
+  oderiv_count_ *= scale;
   num_dims_self_repaired_ *= scale;
   num_dims_processed_ *= scale;
 }
@@ -417,11 +445,16 @@ void NonlinearComponent::Add(BaseFloat alpha, const Component &other_in) {
     value_sum_.Resize(other->value_sum_.Dim());
   if (deriv_sum_.Dim() == 0 && other->deriv_sum_.Dim() != 0)
     deriv_sum_.Resize(other->deriv_sum_.Dim());
+  if (oderiv_sumsq_.Dim() == 0 && other->oderiv_sumsq_.Dim() != 0)
+    oderiv_sumsq_.Resize(other->oderiv_sumsq_.Dim());
   if (other->value_sum_.Dim() != 0)
     value_sum_.AddVec(alpha, other->value_sum_);
   if (other->deriv_sum_.Dim() != 0)
     deriv_sum_.AddVec(alpha, other->deriv_sum_);
+  if (other->oderiv_sumsq_.Dim() != 0)
+    oderiv_sumsq_.AddVec(alpha, other->oderiv_sumsq_);
   count_ += alpha * other->count_;
+  oderiv_count_ += alpha * other->oderiv_count_;
   num_dims_self_repaired_ += alpha * other->num_dims_self_repaired_;
   num_dims_processed_ += alpha * other->num_dims_processed_;
 }
@@ -444,11 +477,27 @@ void NonlinearComponent::Read(std::istream &is, bool binary) {
   deriv_sum_.Read(is, binary);
   ExpectToken(is, binary, "<Count>");
   ReadBasicType(is, binary, &count_);
+  if (PeekToken(is, binary) == 'O') {
+    ExpectToken(is, binary, "<OderivRms>");
+    oderiv_sumsq_.Read(is, binary);
+    oderiv_sumsq_.ApplyPow(2.0);
+    ExpectToken(is, binary, "<OderivCount>");
+    ReadBasicType(is, binary, &oderiv_count_);
+  } else {
+    oderiv_count_ = 0.0;
+    oderiv_sumsq_.Resize(0);
+  }
   value_sum_.Scale(count_);
   deriv_sum_.Scale(count_);
+  oderiv_sumsq_.Scale(oderiv_count_);
 
   std::string token;
   ReadToken(is, binary, &token);
+  if (token[0] != '<') {
+    // this should happen only rarely, in case we couldn't push back the
+    // '<' to the stream in PeekToken().
+    token = '<' + token;
+  }
   if (token == "<NumDimsSelfRepaired>") {
     ReadBasicType(is, binary, &num_dims_self_repaired_);
     ReadToken(is, binary, &token);
@@ -492,14 +541,29 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const {
   Vector<double> temp(value_sum_);
   if (count_ != 0.0) temp.Scale(1.0 / count_);
   temp.Write(os, binary);
-  WriteToken(os, binary, "<DerivAvg>");
-  temp.Resize(deriv_sum_.Dim(), kUndefined);
+  WriteToken(os, binary, "<DerivAvg>");
+  temp.Resize(deriv_sum_.Dim());
   temp.CopyFromVec(deriv_sum_);
   if (count_ != 0.0) temp.Scale(1.0 / count_);
   temp.Write(os, binary);
 
+  WriteToken(os, binary, "<Count>");
   WriteBasicType(os, binary, count_);
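// [Editor's worked example of the round trip implemented above; the numbers
// are made up. Suppose dim_ == 1, oderiv_sumsq_ == [9.0], oderiv_count_ == 4.
// Write() stores the RMS value sqrt(9.0 / 4) = 1.5 under the oderiv-rms token
// (hence the token name), plus the count 4. Read() then reads 1.5, squares it
// via ApplyPow(2.0) to get 2.25, and multiplies by the count
// (oderiv_sumsq_.Scale(oderiv_count_)) to recover 2.25 * 4 == 9.0.]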
+
+  WriteToken(os, binary, "<OderivRms>");
+  temp.Resize(oderiv_sumsq_.Dim());
+  temp.CopyFromVec(oderiv_sumsq_);
+  if (oderiv_count_ != 0.0) temp.Scale(1.0 / oderiv_count_);
+  // The ApplyFloor() is so that the statement after it does not fail even if we
+  // had subtracted models (e.g. in full_progress.*.log).
+  temp.ApplyFloor(0.0);
+  temp.ApplyPow(0.5);
+  temp.Write(os, binary);
+
+  WriteToken(os, binary, "<OderivCount>");
+  WriteBasicType(os, binary, oderiv_count_);
+
   WriteToken(os, binary, "<NumDimsSelfRepaired>");
   WriteBasicType(os, binary, num_dims_self_repaired_);
   WriteToken(os, binary, "<NumDimsProcessed>");
@@ -520,7 +584,7 @@
 }
 
 NonlinearComponent::NonlinearComponent():
-    dim_(-1), block_dim_(-1), count_(0.0),
+    dim_(-1), block_dim_(-1), count_(0.0), oderiv_count_(0.0),
     num_dims_self_repaired_(0.0), num_dims_processed_(0.0),
     self_repair_lower_threshold_(kUnsetThreshold),
     self_repair_upper_threshold_(kUnsetThreshold),
@@ -529,7 +593,8 @@ NonlinearComponent::NonlinearComponent():
 NonlinearComponent::NonlinearComponent(const NonlinearComponent &other):
     dim_(other.dim_), block_dim_(other.block_dim_),
     value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_),
-    count_(other.count_),
+    count_(other.count_), oderiv_sumsq_(other.oderiv_sumsq_),
+    oderiv_count_(other.oderiv_count_),
     num_dims_self_repaired_(other.num_dims_self_repaired_),
     num_dims_processed_(other.num_dims_processed_),
     self_repair_lower_threshold_(other.self_repair_lower_threshold_),
diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h
index 62e09cee80f..c34d550d681 100644
--- a/src/nnet3/nnet-component-itf.h
+++ b/src/nnet3/nnet-component-itf.h
@@ -351,20 +351,23 @@ class Component {
   /// although most components will have much more info.
   virtual std::string Info() const;
 
-  /// This virtual function when called by
-  //  -- an UpdatableComponent scales the parameters
+  /// This virtual function when called on
+  /// -- an UpdatableComponent scales the parameters
   ///    by "scale" when called by an UpdatableComponent.
-  //  -- a Nonlinear component (or another component that
-  ///    stores stats, like BatchNormComponent-- it relates
+  /// -- a Nonlinear component (or another component that
+  ///    stores stats, like BatchNormComponent)-- it relates
   ///    to scaling activation stats, not parameters.
+  /// Otherwise it will normally do nothing.
   virtual void Scale(BaseFloat scale) {};
 
   /// This virtual function when called by
   /// -- an UpdatableComponent adds the parameters of
   ///    another updatable component, times some constant, to the current
   ///    parameters.
-  /// -- a NonlinearComponent it relates to adding stats
-  /// Otherwise it should do nothing.
+  /// -- a NonlinearComponent (or another component that stores
+  ///    stats, like BatchNormComponent)-- it relates to adding
+  ///    stats.
+  /// Otherwise it will normally do nothing.
   virtual void Add(BaseFloat alpha, const Component &other) {};
 
   /// This virtual function only needs to be overwritten by Components that
@@ -587,7 +590,7 @@ class UpdatableComponent: public Component {
      block-dim   Defaults to dim, but may be any nonzero divisor of dim. It
                  affects the self-repair, which will be done while treating the
-                 input/output as repeating blocks of size 'block-dim' (e.g. blocks of filtes). It allows
+                 input/output as repeating blocks of size 'block-dim' (e.g. blocks of filters). It allows
                  us to do self-repair on the filter level in CNNs. Currently this
                  only makes a difference for RectifiedLinearComponent.
   */
@@ -640,6 +643,10 @@ class NonlinearComponent: public Component {
   void StoreStatsInternal(const CuMatrixBase<BaseFloat> &out_value,
                           const CuMatrixBase<BaseFloat> *deriv = NULL);
 
+  // This function may be called from child class members during backprop. It
+  // stores the 'oderiv_sumsq_' stats.
+  void StoreBackpropStats(const CuMatrixBase<BaseFloat> &out_deriv);
+
   const NonlinearComponent &operator = (const NonlinearComponent &other); // Disallow.
@@ -655,8 +662,16 @@
   CuVector<double> deriv_sum_; // stats of the derivative of the nonlinearity
                                // (only applicable to element-by-element
                                // nonlinearities, not Softmax.
+  // Count corresponding to the stats in 'value_sum_' and 'deriv_sum_'
   double count_;
 
+  CuVector<double> oderiv_sumsq_;  // Sum-square of the derivative of the
+                                   // objective function, that we're propagating
+                                   // back. Accumulated during the backprop;
+                                   // used for diagnostics.
+  // Count corresponding to the stats in 'oderiv_sumsq_'.
+  double oderiv_count_;
+
   // some stats for self-repairing nonlinearities.
   double num_dims_self_repaired_;
   double num_dims_processed_;
@@ -665,9 +680,6 @@
   BaseFloat self_repair_lower_threshold_;
   BaseFloat self_repair_upper_threshold_;
   BaseFloat self_repair_scale_;
-
-  // The mutex is used in UpdateStats, only for resizing vectors.
-  std::mutex mutex_;
 };
 
 } // namespace nnet3
diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc
index 77facbdba79..a9a21bb3f24 100644
--- a/src/nnet3/nnet-computation.cc
+++ b/src/nnet3/nnet-computation.cc
@@ -282,6 +282,10 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) {
     command_type = kAddToRowsMulti;
   } else if (command_type_str == "kAddRowRanges") {
     command_type = kAddRowRanges;
+  } else if (command_type_str == "kCompressMatrix") {
+    command_type = kCompressMatrix;
+  } else if (command_type_str == "kDecompressMatrix") {
+    command_type = kDecompressMatrix;
   } else if (command_type_str == "kAcceptInput") {
     command_type = kAcceptInput;
   } else if (command_type_str == "kProvideOutput") {
     command_type = kProvideOutput;
@@ -375,6 +379,12 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const {
     case kAddRowRanges:
       os << "kAddRowRanges\n";
       break;
+    case kCompressMatrix:
+      os << "kCompressMatrix\n";
+      break;
+    case kDecompressMatrix:
+      os << "kDecompressMatrix\n";
+      break;
     case kAcceptInput:
       os << "kAcceptInput\n";
       break;
@@ -500,13 +510,17 @@ static void GetIndexesMultiStrings(
 
 
 // writes to "os" the statement for this command.
-static void PrintCommand(std::ostream &os,
+static void PrintCommand(std::ostream &os_out,
                          const Nnet &nnet,
                          const NnetComputation &computation,
                          int32 command_index,
                         const std::vector<std::string> &submatrix_strings,
                         const std::vector<std::string> &indexes_strings,
                         const std::vector<std::string> &indexes_multi_strings) {
+  // If the string is longer than 'max_string_length' characters, it will
+  // be summarized with '...' in the middle.
+  size_t max_string_length = 200;
+  std::ostringstream os;
   KALDI_ASSERT(command_index < computation.commands.size());
   os << "c" << command_index << ": ";
   const NnetComputation::Command &c = computation.commands[command_index];
@@ -611,6 +625,25 @@
       os << "])\n";
       break;
    }
+    case kCompressMatrix: {
+      BaseFloat range = c.alpha;
+      std::string truncate = (c.arg3 != 0 ? "true" : "false");
+      std::string compressed_matrix_type;
+      if (c.arg2 == kCompressedMatrixInt8) { compressed_matrix_type = "int8"; }
+      else if (c.arg2 == kCompressedMatrixUint8) { compressed_matrix_type = "uint8"; }
+      else if (c.arg2 == kCompressedMatrixInt16) { compressed_matrix_type = "int16"; }
+      else {
+        KALDI_ASSERT(c.arg2 == kCompressedMatrixUint16);
+        compressed_matrix_type = "uint16";
+      }
+      os << "CompressMatrix(" << submatrix_strings[c.arg1] << ", "
+         << range << ", " << compressed_matrix_type << ", "
+         << truncate << ")\n";
+      break;
+    }
+    case kDecompressMatrix:
+      os << "DecompressMatrix(" << submatrix_strings[c.arg1] << ")\n";
+      break;
    case kAcceptInput:
      os << submatrix_strings[c.arg1] << " = user input [for node: '"
         << nnet.GetNodeName(c.arg2) << "']\n";
      break;
@@ -637,6 +670,14 @@
     default:
       KALDI_ERR << "Un-handled command type.";
   }
+  std::string str = os.str();
+  if (str.size() <= max_string_length) {
+    os_out << str;
+  } else {
+    size_t len = str.size();
+    os_out << str.substr(0, max_string_length / 2) << " ... "
+           << str.substr(len - max_string_length / 2);
+  }
 }
 
@@ -689,7 +730,7 @@ void NnetComputation::Print(std::ostream &os, const Nnet &nnet) const {
 }
 
 void NnetComputation::Read(std::istream &is, bool binary) {
-  int32 version = 4,  // must be in sync with 'version' in Write.
+  int32 version = 5,  // must be in sync with 'version' in Write.
      version_in = 1;  // defaults to 1 if no version specified.
 
   ExpectToken(is, binary, "<NnetComputation>");
@@ -823,7 +864,7 @@
 }
 
 void NnetComputation::Write(std::ostream &os, bool binary) const {
-  int32 version = 4;  // Must be in sync with version in Read.
+  int32 version = 5;  // Must be in sync with version in Read.
  WriteToken(os, binary, "<NnetComputation>");
  WriteToken(os, binary, "<Version>");
  WriteBasicType(os, binary, version);
diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h
index 4b1386a1f01..d056a71498c 100644
--- a/src/nnet3/nnet-computation.h
+++ b/src/nnet3/nnet-computation.h
@@ -232,6 +232,17 @@ struct ComputationRequest {
        indexes_ranges[arg3]. We use the "alpha" as if AddRowRanges()
        accepted that argument, even though it doesn't (we fake it using other
        calls, if alpha != 1.0).
+   - kCompressMatrix: Compresses the matrix which should be referred to
+      by submatrix-index arg1. arg2 is a number that determines the
+      compression type (it's converted from the enum
+      CuCompressedMatrixType; 1=int8, 2=uint8, 3=int16, 4=uint16), and alpha
+      determines the 'range' parameter (c.f. NewCuCompressedMatrix()). arg3
+      will be converted to the 'truncate' argument to the class
+      CuCompressedMatrix; it should be false (0) if you know that the input is
+      limited to the allowed range, and true (1) if the input may exceed that
+      range (see docs for CuCompressedMatrix).
+   - kDecompressMatrix: Decompresses the matrix which is referred to
+      by submatrix-index arg1 (it should previously have been compressed).
    - kAcceptInput: accepts a matrix of input from the user, which may be
      either features, or derivatives w.r.t. the output.
      arg1 is the submatrix index of a whole matrix that the input goes to,
      and arg2 is the index of the network
@@ -263,7 +274,8 @@ enum CommandType {
  kPropagate, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd,
  kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti,
  kAddRowsMulti, kAddToRowsMulti,
-  kAddRowRanges, kAcceptInput, kProvideOutput,
+  kAddRowRanges, kCompressMatrix, kDecompressMatrix,
+  kAcceptInput, kProvideOutput,
  kNoOperation, kNoOperationPermanent, kNoOperationMarker, kNoOperationLabel,
  kGotoLabel };
diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc
index 87fa62c6112..19eecdda72b 100644
--- a/src/nnet3/nnet-compute.cc
+++ b/src/nnet3/nnet-compute.cc
@@ -30,22 +30,37 @@ NnetComputer::NnetComputer(const NnetComputeOptions &options,
                            const Nnet &nnet,
                            Nnet *nnet_to_update):
     options_(options), computation_(computation), nnet_(nnet),
-    program_counter_(0), nnet_to_update_(nnet_to_update) {
-  KALDI_ASSERT(computation.indexes_cuda.size() == computation.indexes.size() &&
-      computation.indexes_ranges_cuda.size() == computation.indexes_ranges.size() &&
+    program_counter_(0), nnet_to_store_stats_(nnet_to_update),
+    nnet_to_update_(nnet_to_update) {
+  Init();
+}
+
+NnetComputer::NnetComputer(const NnetComputeOptions &options,
+                           const NnetComputation &computation,
+                           Nnet *nnet,
+                           Nnet *nnet_to_update):
+    options_(options), computation_(computation), nnet_(*nnet),
+    program_counter_(0), nnet_to_store_stats_(nnet),
+    nnet_to_update_(nnet_to_update) {
+  Init();
+}
+
+void NnetComputer::Init() {
+  KALDI_ASSERT(computation_.indexes_cuda.size() == computation_.indexes.size() &&
+      computation_.indexes_ranges_cuda.size() == computation_.indexes_ranges.size() &&
       "You must call NnetComputation::ComputeCudaIndexes() before "
       "executing the computation.");
-  matrices_.resize(computation.matrices.size());
+  matrices_.resize(computation_.matrices.size());
   debug_ = (options_.debug || GetVerboseLevel() >= 5);
   if (debug_) {
     ComputationVariables variables;
-    variables.Init(computation);
-    ComputeCommandAttributes(nnet, computation, variables,
+    variables.Init(computation_);
+    ComputeCommandAttributes(nnet_, computation_, variables,
                              &command_attributes_);
     std::string preamble;
-    computation.GetCommandStrings(nnet, &preamble, &command_strings_);
+    computation_.GetCommandStrings(nnet_, &preamble, &command_strings_);
     KALDI_LOG << preamble;
-    computation.GetSubmatrixStrings(nnet, &submatrix_strings_);
+    computation_.GetSubmatrixStrings(nnet_, &submatrix_strings_);
   }
 }
 
@@ -177,6 +192,7 @@ NnetComputer::NnetComputer(const NnetComputer &other):
     nnet_(other.nnet_),
     program_counter_(other.program_counter_),
     pending_commands_(other.pending_commands_),
+    nnet_to_store_stats_(other.nnet_to_store_stats_),
     nnet_to_update_(other.nnet_to_update_),
     debug_(other.debug_),
     command_attributes_(other.command_attributes_),
@@ -226,14 +242,14 @@ void NnetComputer::ExecuteCommand() {
       CuSubMatrix<BaseFloat> output(GetSubMatrix(c.arg4));
       void *memo = component->Propagate(indexes, input, &output);
       if (c.arg6) {  // need to store stats.
-        KALDI_ASSERT(nnet_to_update_ != NULL);
-        Component *upd_component = nnet_to_update_->GetComponent(c.arg1);
+        KALDI_ASSERT(nnet_to_store_stats_ != NULL);
+        Component *stats_component = nnet_to_store_stats_->GetComponent(c.arg1);
         bool was_in_place = (c.arg3 == c.arg4);
         // if propagate was in-place, provide empty matrix and not 'input', as
         // input is no longer valid.
         const CuSubMatrix<BaseFloat> maybe_input(
            GetSubMatrix(was_in_place ? 0 : c.arg3));
-        upd_component->StoreStats(maybe_input, output, memo);
+        stats_component->StoreStats(maybe_input, output, memo);
       }
       SaveMemo(c.arg5, *component, memo);
       break;
     }
@@ -245,11 +261,21 @@
       debug_str << nnet_.GetComponentName(c.arg1);
       const Component *component = nnet_.GetComponent(c.arg1);
       KALDI_ASSERT(!(computation_.need_model_derivative && !nnet_to_update_));
-      Component *upd_component = (nnet_to_update_ &&
-                                  c.command_type == kBackprop &&
-                                  computation_.need_model_derivative ?
-                                  nnet_to_update_->GetComponent(c.arg1) :
-                                  NULL);
+      Component *upd_component = NULL;
+      if (c.command_type == kBackprop) {  // this block sets 'upd_component'
+        Nnet *nnet_to_update;
+        if (component->Properties() & kUpdatableComponent) {
+          nnet_to_update = (computation_.need_model_derivative ?
+                            nnet_to_update_ : NULL);
+        } else {
+          // Some non-updatable components, such as CompositeComponent, store
+          // stats in the backprop. For other types of non-updatable
+          // component, this arg won't matter.
+          nnet_to_update = nnet_to_store_stats_;
+        }
+        if (nnet_to_update)
+          upd_component = nnet_to_update->GetComponent(c.arg1);
+      }
       ComponentPrecomputedIndexes *indexes =
           computation_.component_precomputed_indexes[c.arg2].data;
       const CuSubMatrix<BaseFloat> in_value(GetSubMatrix(c.arg3));
@@ -356,6 +382,42 @@
      }
      break;
    }
+    case kCompressMatrix:
+      // This does nothing if CUDA is not in use.
+#if HAVE_CUDA == 1
+      if (CuDevice::Instantiate().Enabled()) {
+        if (compressed_matrices_.empty())
+          compressed_matrices_.resize(matrices_.size(), NULL);
+        int32 m = computation_.submatrices[c.arg1].matrix_index;
+        KALDI_ASSERT(compressed_matrices_[m] == NULL &&
+                     matrices_[m].NumRows() != 0);
+        BaseFloat range = c.alpha;
+        bool truncate = (c.arg3 != 0);
+        compressed_matrices_[m] = NewCuCompressedMatrix(
+            static_cast<CuCompressedMatrixType>(c.arg2),
+            range, truncate);
+        compressed_matrices_[m]->CopyFromMat(matrices_[m]);
+        matrices_[m].Resize(0, 0);
+      }
+      break;
+#endif
+    case kDecompressMatrix:
+#if HAVE_CUDA == 1
+      if (CuDevice::Instantiate().Enabled()) {
+        int32 m = computation_.submatrices[c.arg1].matrix_index;
+        CuCompressedMatrixBase *compressed_matrix =
+            compressed_matrices_[m];
+        KALDI_ASSERT(compressed_matrix != NULL &&
+                     matrices_[m].NumRows() == 0);
+        matrices_[m].Resize(compressed_matrix->NumRows(),
+                            compressed_matrix->NumCols(),
+                            kUndefined);
+        compressed_matrix->CopyToMat(&(matrices_[m]));
+        delete compressed_matrix;
+        compressed_matrices_[m] = NULL;
+      }
+#endif
+      break;
    case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker:
    case kNoOperationLabel:
      break;
@@ -609,5 +671,14 @@
   }
 }
 
+NnetComputer::~NnetComputer() {
+  // Delete any pointers that are present in compressed_matrices_. Actually
+  // they should all already have been deallocated and set to NULL if the
+  // computation was run to completion; we do this in case someone ran
+  // the forward propagation but not the backprop.
+  for (size_t i = 0; i < compressed_matrices_.size(); i++)
+    delete compressed_matrices_[i];
+}
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h
index e16cbfbb393..333ed3168b9 100644
--- a/src/nnet3/nnet-compute.h
+++ b/src/nnet3/nnet-compute.h
@@ -62,15 +62,29 @@ class NnetComputer {
   /// model update or model-derivative computation.
   /// You must call computation.ComputeCudaIndexes() before calling
   /// this function.
+ /// + /// Caution: there is another constructor that takes a pointer for + /// 'nnet', be careful not to mix these up. NnetComputer(const NnetComputeOptions &options, const NnetComputation &computation, const Nnet &nnet, Nnet *nnet_to_update); - /// Copy constructor. May not be used if memos are involved (memos are only - /// possible if backprop will take place, and in these situations you won't - /// normally be wanting to use the copy constructor anyway; the copy - /// constructor is more useful for things like RNNLM lattice rescoring). + /// This version of the constructor accepts a pointer to 'nnet' instead + /// of a const reference. The difference is that this version will, + /// for storing statistics (the StoreStats() function of class Component), + /// use 'nnet' instead of 'nnet_to_update' (if specified). + NnetComputer(const NnetComputeOptions &options, + const NnetComputation &computation, + Nnet *nnet, + Nnet *nnet_to_update); + + + /// Copy constructor. May not be used if memos are stored with this object + /// (which is only a possibility if backprop will take place, and in these + /// situations you won't normally be wanting to use the copy constructor + /// anyway; the copy constructor is more useful for things like RNNLM lattice + /// rescoring). NnetComputer(const NnetComputer &other); /// e.g. AcceptInput ("input", &input_mat), or for derivatives w.r.t. the @@ -111,10 +125,14 @@ class NnetComputer { CuMatrix *output); + ~NnetComputer(); private: + void Init(); // called from constructors. + const NnetComputeOptions &options_; const NnetComputation &computation_; const Nnet &nnet_; + int32 program_counter_; // command index to execute next. // To deal with inputs and outputs that are not provided/taken by the user in // the same order as listed in the computation, pending_commands_ contains a @@ -122,6 +140,13 @@ class NnetComputer { // executed. std::vector pending_commands_; + // A pointer to the copy of the nnet which we'll be using for stats + // accumulation (the StoreStats() function). May be NULL or the same + // as nnet_ or nnet_to_update_. + Nnet *nnet_to_store_stats_; + // A pointer to the copy of the nnet which we'll be updating the parameters + // of (nnet_to_update in the backprop function). May be NULL and usually + // will not be the same as nnet_. Nnet *nnet_to_update_; bool debug_; // command_attributes_ is only used if debug_=true. @@ -139,6 +164,14 @@ class NnetComputer { // NULL). std::vector memos_; + // This is only used when commands kCompressMatrix and kDecompressMatrix are + // invoked. It will be (the first time we compress a matrix) resized to be + // the same size as 'matrices_' (i.e., indexed by matrix index). When we + // compress a matrix m we set compressed_matrices_[m] to a non-NULL value and + // resize matrices_[m] to empty; and when we uncompress it, the reverse + // happens. + std::vector compressed_matrices_; + // executes the command in computation_.commands[program_counter_]. void ExecuteCommand(); @@ -207,7 +240,6 @@ class NnetComputer { // memos are not reusable. inline void *GetMemo(int32 memo_index); - private: NnetComputer &operator = (const NnetComputer &other); // Disallow. 
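// To see how 'compressed_matrices_' interacts with 'matrices_' at runtime,
// here is the life of one matrix m across a forward+backward pass (a sketch
// of the command sequence, based on ExecuteCommand(), not literal output):
//   kPropagate        -> matrices_[m] is written
//   kCompressMatrix   -> compressed_matrices_[m] = NewCuCompressedMatrix(...);
//                        copy matrices_[m] in; matrices_[m].Resize(0, 0)
//   ... commands that do not need m ...
//   kDecompressMatrix -> matrices_[m].Resize(rows, cols, kUndefined);
//                        copy back; delete compressed_matrices_[m]
//   kBackprop         -> matrices_[m] is read again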
}; diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index f689984e876..bea3b9d31d5 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -263,18 +263,14 @@ void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("alpha-out", &alpha_out); cfl->GetValue("num-minibatches-history", &num_minibatches_history); - preconditioner_in_.SetAlpha(alpha_in); - preconditioner_out_.SetAlpha(alpha_out); int32 dim_in = linear_params_.NumCols() + 1, dim_out = linear_params_.NumRows(); - if (rank_in < 0) { + if (rank_in < 0) rank_in = std::min(80, (dim_in + 1) / 2); - preconditioner_in_.SetRank(rank_in); - } - if (rank_out < 0) { + preconditioner_in_.SetRank(rank_in); + if (rank_out < 0) rank_out = std::min(80, (dim_out + 1) / 2); - preconditioner_out_.SetRank(rank_out); - } + preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetNumMinibatchesHistory(num_minibatches_history); preconditioner_out_.SetNumMinibatchesHistory(num_minibatches_history); @@ -360,29 +356,29 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { - CuVector bias_temp(bias_params_.Dim()); + CuVector bias_deriv(bias_params_.Dim()); - { // this block computes 'bias_temp', the derivative w.r.t. the bias. + { // this block computes 'bias_deriv', the derivative w.r.t. the bias. KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && out_deriv.NumCols() == model_.height_out * model_.num_filters_out); CuSubMatrix out_deriv_reshaped( out_deriv.Data(), out_deriv.NumRows() * model_.height_out, model_.num_filters_out, model_.num_filters_out); - bias_temp.AddRowSumMat(1.0, out_deriv_reshaped); + bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); } - CuMatrix params_temp(linear_params_.NumRows(), + CuMatrix params_deriv(linear_params_.NumRows(), linear_params_.NumCols() + 1); - params_temp.CopyColFromVec(bias_temp, linear_params_.NumCols()); + params_deriv.CopyColFromVec(bias_deriv, linear_params_.NumCols()); - CuSubMatrix linear_params_temp( - params_temp, 0, linear_params_.NumRows(), + CuSubMatrix linear_params_deriv( + params_deriv, 0, linear_params_.NumRows(), 0, linear_params_.NumCols()); ConvolveBackwardParams(indexes.computation, in_value, out_deriv, - 1.0, &linear_params_temp); + 1.0, &linear_params_deriv); // the precondition-directions code outputs a scalar that // must be multiplied by its output (this saves one @@ -393,22 +389,19 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( // scalars are different across iterations, the scalars // will be pretty similar on different iterations BaseFloat scale1, scale2; - preconditioner_in_.PreconditionDirections(¶ms_temp, NULL, - &scale1); - + preconditioner_in_.PreconditionDirections(¶ms_deriv, &scale1); - CuMatrix params_temp_transpose(params_temp, kTrans); - preconditioner_out_.PreconditionDirections(¶ms_temp_transpose, - NULL, &scale2); + CuMatrix params_deriv_transpose(params_deriv, kTrans); + preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, &scale2); linear_params_.AddMat( learning_rate_ * scale1 * scale2, - params_temp_transpose.RowRange(0, linear_params_.NumCols()), + params_deriv_transpose.RowRange(0, linear_params_.NumCols()), kTrans); bias_params_.AddVec(learning_rate_ * scale1 * scale2, - params_temp_transpose.Row(linear_params_.NumCols())); + params_deriv_transpose.Row(linear_params_.NumCols())); } diff --git a/src/nnet3/nnet-general-component.cc 
b/src/nnet3/nnet-general-component.cc index 6b90787ea95..dd6e950a7d1 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1252,7 +1252,7 @@ void ConstantComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc new file mode 100644 index 00000000000..d10c6fabd36 --- /dev/null +++ b/src/nnet3/nnet-normalize-component.cc @@ -0,0 +1,680 @@ +// nnet3/nnet-normalize-component.cc + +// Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen +// 2015 Daniel Galvez + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "nnet3/nnet-normalize-component.h" +#include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" + +namespace kaldi { +namespace nnet3 { + +const BaseFloat NormalizeComponent::kSquaredNormFloor = + pow(2.0, NormalizeComponent::kExpSquaredNormFloor); + +NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): + input_dim_(other.input_dim_), block_dim_(other.block_dim_), + target_rms_(other.target_rms_), + add_log_stddev_(other.add_log_stddev_) { } + +void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { + input_dim_ = 0; + add_log_stddev_ = false; + target_rms_ = 1.0; + bool ok = cfl->GetValue("dim", &input_dim_) || + cfl->GetValue("input-dim", &input_dim_); + block_dim_ = input_dim_; + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("add-log-stddev", &add_log_stddev_); + if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || + block_dim_ <= 0 || input_dim_ % block_dim_ != 0) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; +} + +void NormalizeComponent::Read(std::istream &is, bool binary) { + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == "" || token == ""); + ReadBasicType(is, binary, &input_dim_); // Read dimension. + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &block_dim_); + ReadToken(is, binary, &token); + } else { + block_dim_ = input_dim_; + } + // read target_rms_ if it is available. + if (token == "") { + ReadBasicType(is, binary, &target_rms_); + ReadToken(is, binary, &token); + } + // Read add_log_stddev_ token, if it is available. + if (token == "") { + ReadBasicType(is, binary, &add_log_stddev_); + ReadToken(is, binary, &token); + } else { + add_log_stddev_ = false; + } + if (token == "") { + // back-compatibility code. 
+ CuVector<double> temp; + temp.Read(is, binary); + ExpectToken(is, binary, "<DerivAvg>"); + temp.Read(is, binary); + ExpectToken(is, binary, "<Count>"); + double count; + ReadBasicType(is, binary, &count); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == "</NormalizeComponent>"); +} + +void NormalizeComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, "<NormalizeComponent>"); + WriteToken(os, binary, "<InputDim>"); + WriteBasicType(os, binary, input_dim_); + if (block_dim_ != input_dim_) { + WriteToken(os, binary, "<BlockDim>"); + WriteBasicType(os, binary, block_dim_); + } + WriteToken(os, binary, "<TargetRms>"); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, "<AddLogStddev>"); + WriteBasicType(os, binary, add_log_stddev_); + WriteToken(os, binary, "</NormalizeComponent>"); +} + +std::string NormalizeComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << InputDim() + << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ + << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; + if (block_dim_ != input_dim_) + stream << ", block-dim=" << block_dim_; + return stream.str(); +} + +// The output y_i = scale * x_i, +// and we want the RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop that the floor's +// square root is exactly representable as float. +// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. +void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in, + CuMatrixBase<BaseFloat> *out) const { + KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && + in.NumRows() == out->NumRows()); + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + CuSubMatrix<BaseFloat> in_reshaped(in.Data(), new_num_rows, + block_dim_, block_dim_), + out_reshaped(out->Data(), new_num_rows, + output_block_dim, output_block_dim); + cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, + &out_reshaped); + } else { + cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); + } + return NULL; +} + +/* + A note on the derivative of NormalizeComponent... + let both row_in and row_out be vectors of dimension D. + Let p = row_in^T row_in / (D * target_rms^2), and let + f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: + row_out = f row_in. + Suppose we have a quantity deriv_out which is the derivative + of the objective function w.r.t. row_out. We want to compute + deriv_in which is the derivative of the objective function w.r.t. + row_in. Let the objective function be F. One term is obvious: we have + deriv_in = f deriv_out + .... + next we have to take into account the derivative that gets back-propagated + through f. Obviously, dF/df = deriv_out^T row_in. + And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), + and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. + So this term in dF/d(row_in) equals: + dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in + So + deriv_in = f deriv_out + (f == 1.0 / sqrt(kSquaredNormFloor) ?
0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in + + if add_log_stddev_ true, the deriv_in has another term as + dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) +*/ +void NormalizeComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (!in_deriv) + return; + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in_value.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, + block_dim_, block_dim_), + out_deriv_reshaped(out_deriv.Data(), new_num_rows, + output_block_dim, output_block_dim), + in_deriv_reshaped(in_deriv->Data(), new_num_rows, + block_dim_, block_dim_); + cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, + add_log_stddev_, &in_deriv_reshaped); + } else { + cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, + in_deriv); + } +} + +void BatchNormComponent::ComputeDerived() { + if (!test_mode_) { + offset_.Resize(0); + scale_.Resize(0); + return; + } + + if (count_ == 0.0) { + KALDI_WARN << "Test-mode is set but there is no data count. " + "Creating random counts. This only makes sense " + "in unit-tests (or compute_prob_*.0.log). If you see this " + "elsewhere, something is very wrong."; + count_ = 1.0; + stats_sum_.SetRandn(); + stats_sumsq_.SetRandn(); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + } + + offset_.Resize(block_dim_); + scale_.Resize(block_dim_); + offset_.CopyFromVec(stats_sum_); + offset_.Scale(-1.0 / count_); + // now offset_ is -mean. + scale_.CopyFromVec(stats_sumsq_); + scale_.Scale(1.0 / count_); + scale_.AddVecVec(-1.0, offset_, offset_, 1.0); + // now scale_ is variance. + // Mathematically the ApplyFloor statement should be a no-op; this is in case + // of numerical roundoff. + scale_.ApplyFloor(0.0); + scale_.Add(epsilon_); + BaseFloat power = -0.5; + scale_.ApplyPow(power); + // now scale_ = min(variance, epsilon)^power + // next, multiply by the target RMS (normally 1.0). + scale_.Scale(target_rms_); + offset_.MulElements(scale_); + // now offset_ is -(scale*mean). +} + +void BatchNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; + ComputeDerived(); +} + +void BatchNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ > 0.0); +} + +BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), + epsilon_(other.epsilon_), target_rms_(other.target_rms_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) { + ComputeDerived(); + Check(); +} + + +std::string BatchNormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? 
"true" : "false"); + if (count_ > 0) { + Vector mean(stats_sum_), var(stats_sumsq_); + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + // subtract mean^2 from var. + var.AddVecVec(-1.0, mean, mean, 1.0); + var.ApplyFloor(0.0); + var.ApplyPow(0.5); // make it the stddev. + stream << ", data-mean=" << SummarizeVector(mean) + << ", data-stddev=" << SummarizeVector(var); + } + return stream.str(); +} + +void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + test_mode_ = false; + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("test-mode", &test_mode_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in BatchNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + if (test_mode_) { + ComputeDerived(); + } +} + + + +/* + BATCHNORM_MATH + + This comment describes the equations involved in batch normalization, and + derives the forward and back-propagation. + + This is all dimension-by-dimension, so we just imagine the inputs + are scalars x(i), for i=0 .. n-1. + + FORWARD PASS: + + Let 'power' be a constant, equal to -0.5 for regular batch-norm. + + To simplify the math we (conceptually, not physically) do the normalization in + two stages: first mean, then variance, so we have x(i) -> y(i) -> z(i). + + The name 'rscale' means 'raw scale', meaning the scale before including + target-rms. Later we'll define 'scale = target-rms * rscale', to make some + of the actual computations slightly more efficient. + + Define: mean = 1/I * sum_i x(i) + y(i) = x(i) - mean + + var = 1/I \sum_i y(i)^2 + rscale = sqrt(var + epsilon)^power <---- For regular batchnorm, power == -0.5. + z(i) = target-rms * rscale * y(i) + + + Most of the rest of this comment derives how to compute the derivatives. If + you just want the formulas, please skip to the string 'BACKWARD PASS' below. + + We'll use a notation where an apostrophe on something means (the derivative of + the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. + We are given y'(i). Propagating the derivatives backward: + + rscale' = (sum_i y(i) z'(i)) * target-rms + = (sum_i z(i) z'(i)) / rscale + + [ note: d(rscale)/d(var) = power * (var + epsilon)^{power - 1} + = power * rscale^{(power-1)/power} ] + + var' = rscale' * power * rscale^{(power-1)/power} + = power * (\sum_i z'(i) z(i)) * rscale^{(power-1)/power - 1} + = power * (\sum_i z'(i) z(i)) * rscale^{-1/power} + + [note: the following formula is of the form "direct term" + "indirect term"] + y'(i) = z'(i) * target-rms * rscale + 2/I y(i) var' + + Now, the above is inconvenient because it contains y(i) which is an intermediate + quantity. 
We reformulate in terms of z(i), using y(i) = z(i) / (target-rms * rscale), so: + + defining + var_deriv_mod = 2/I * var' / (target-rms * rscale) + = 2/I * power/target-rms * (\sum_i z'(i) z(i)) * rscale^{-(1+power)/power} + we have: + y'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod + + Now, + mean' = \sum_i y'(i) + = (target-rms * rscale * \sum_i z'(i)) + (var_deriv_mod \sum_i z(i)) + [... and the 2nd term above is zero when summed over i, because \sum_i z(i) is zero, ...] + = target-rms * rscale * \sum_i z(i) + and: + x'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I mean' + = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I * target-rms * rscale * \sum_i z'(i) + = target-rms * rscale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + + It will simplify the code if we define: + + scale = target-rms * rscale. This way, we can write as follows: + + BACKWARD PASS (recap): + + var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + .. which for power = -0.5, simplifies to: + var_deriv_mod = -1.0 / (target-rms^2) * (1/I \sum_i z'(i) z(i)) * scale + + x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + + */ +void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + + if (!test_mode_) { + // search in the comment above for FORWARD PASS to see what is being + // implemented here. + // if this takes too much time due to multiple different CUDA calls, + // we'll consider making a single kernel for some of it. + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->mean_uvar_scale.Resize(5, dim); + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1), + scale(memo->mean_uvar_scale, 2); + mean.AddRowSumMat(1.0 / num_frames, in, 0.0); + uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); + scale.CopyFromVec(uvar); + + // by applying this scale at this point, we save a multiply later on. + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-var_scale, mean, mean, var_scale); + // at this point, 'scale' contains just the variance (times target-rms^{-2}). + scale.ApplyFloor(0.0); + scale.Add(var_scale * epsilon_); + // Now 'scale' contains the variance floored to zero and then with epsilon + // added [both times 1/target-rms^2]. + scale.ApplyPow(-0.5); + // now 'scale' is the actual scale we'll use. + + // the next command will do no work if out == in, for in-place propagation. 
+ out->CopyFromMat(in); + out->AddVecToRows(-1.0, mean, 1.0); + out->MulColsVec(scale); + return static_cast(memo); + } else { + if (offset_.Dim() != block_dim_) { + if (count_ == 0) + KALDI_ERR << "Test mode set in BatchNormComponent, but no stats."; + else // why was ComputeDerived() not called? + KALDI_ERR << "Code error in BatchNormComponent"; + } + out->CopyFromMat(in); + out->MulColsVec(scale_); + out->AddVecToRows(1.0, offset_, 1.0); + return NULL; + } +} + +void BatchNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update, // unused + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_value, out_deriv) && + SameDim(out_value, *in_deriv) && + (out_value.NumCols() == dim_ || + out_value.NumCols() == block_dim_)); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols), + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value, so pass it in unchanged. + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update, &in_deriv_reshaped); + return; + } + + Memo *memo = static_cast(memo_in); + + if (!test_mode_) { + // search above for BACKWARD PASS for a comment describing the math. + KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); + int32 num_frames = memo->num_frames; + KALDI_ASSERT(out_value.NumRows() == num_frames); + CuSubVector + scale(memo->mean_uvar_scale, 2), + var_deriv_mod(memo->mean_uvar_scale, 3), + temp(memo->mean_uvar_scale, 4); + + // var_deriv_mod is going to contain: + // 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + // which for power = -0.5 simplifies to: + // -1.0 / (target_rms * target_rms). + // but for now we don't have the power of 'scale', we'll add that later. + BaseFloat coeff = -1.0 / (target_rms_ * target_rms_ * num_frames); + + var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans, + out_deriv, kNoTrans, 0.0); + var_deriv_mod.MulElements(scale); + + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. + in_deriv->CopyFromMat(out_deriv); + in_deriv->AddVecToRows(1.0, temp); + // At this point, *in_deriv contains + // (z'(i) - 1/I * \sum_i z'(i)) + in_deriv->MulColsVec(scale); + // At this point, *in_deriv contains + // scale * (z'(i) - 1/I * \sum_i z'(i)) + + in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, + var_deriv_mod, 1.0); + + // At this point, *in_deriv contains what we described in the comment + // starting BATCHNORM_MATH as: + // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + } else { + KALDI_ASSERT(offset_.Dim() == block_dim_); + // the next call does no work if they point to the same memory. 
+ in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale_); + } +} + +void BatchNormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. + StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1); + KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); + BaseFloat num_frames = memo->num_frames; + if (stats_sum_.Dim() != block_dim_) { + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + KALDI_ASSERT(count_ == 0); + } + count_ += num_frames; + stats_sum_.AddVec(num_frames, mean, 1.0); + stats_sumsq_.AddVec(num_frames, uvar, 1.0); +} + +void BatchNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + stats_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + stats_sumsq_.Read(is, binary); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + stats_sum_.Scale(count_); + stats_sumsq_.Scale(count_); + ExpectToken(is, binary, ""); + ComputeDerived(); + Check(); +} + +void BatchNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector mean(stats_sum_), var(stats_sumsq_); + if (count_ != 0) { + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + var.AddVecVec(-1.0, mean, mean, 1.0); + } + WriteToken(os, binary, ""); + mean.Write(os, binary); + WriteToken(os, binary, ""); + var.Write(os, binary); + WriteToken(os, binary, ""); +} + +void BatchNormComponent::Scale(BaseFloat scale) { + if (scale == 0) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } else { + count_ *= scale; + stats_sum_.Scale(scale); + stats_sumsq_.Scale(scale); + } +} + + +void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const BatchNormComponent *other = + 
dynamic_cast<const BatchNormComponent*>(&other_in); + count_ += alpha * other->count_; + stats_sum_.AddVec(alpha, other->stats_sum_); + stats_sumsq_.AddVec(alpha, other->stats_sumsq_); + // this operation might change offset_ and scale_, so we recompute them + // in this instance (but not in Scale()). + ComputeDerived(); +} + +void BatchNormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. In test mode, this would + // be dangerous as the stats are the source for the transform, and zeroing + // them and then calling ComputeDerived() again would remove the transform + // parameters (offset_ and scale_). + if (!test_mode_) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } +} + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h new file mode 100644 index 00000000000..1806fe38493 --- /dev/null +++ b/src/nnet3/nnet-normalize-component.h @@ -0,0 +1,303 @@ +// nnet3/nnet-normalize-component.h + +// Copyright 2011-2013 Karel Vesely +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2013 Xiaohui Zhang +// 2014-2015 Vijayaditya Peddinti +// 2014-2015 Guoguo Chen +// 2015 Daniel Galvez +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ +#define KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ + +#include "nnet3/nnet-common.h" +#include "nnet3/nnet-component-itf.h" +#include "nnet3/natural-gradient-online.h" +#include + +namespace kaldi { +namespace nnet3 { + +/// @file nnet-normalize-component.h +/// +/// This file contains declarations of components that in one way or +/// another normalize their input: NormalizeComponent and BatchNormComponent. + +/* + NormalizeComponent implements the function: + + y = x * (sqrt(dim(x)) * target-rms) / |x| + + where |x| is the 2-norm of the vector x. I.e. its output is its input + scaled such that the root-mean-square value of its elements equals + target-rms. (As a special case, if the input is zero, it outputs zero). + This is like Hinton's layer-norm, except not normalizing the mean, only + the variance. + + + Note: if you specify add-log-stddev=true, it adds an extra element to + y which equals log(|x| / sqrt(dim(x))). + + + Configuration values accepted: + dim, or input-dim Input dimension of this component, e.g. 1024. + Will be the same as the output dimension if add-log-stddev=false. + block-dim Defaults to 'dim', but you may specify a nonzero divisor + of 'dim'. In this case the input dimension will + be interpreted as blocks of dimension 'block-dim' + to which the nonlinearity described above is applied + separately. + add-log-stddev You can set this to true to add an extra output + dimension which will equal log(|x| / sqrt(dim(x))). + If block-dim is specified, this is done per block.
+ target-rms This defaults to 1.0, but if set it to another + (nonzero) value, the output will be scaled by this + factor. + */ +class NormalizeComponent: public Component { + public: + explicit NormalizeComponent(const NormalizeComponent &other); + + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds| + (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) | + (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0); + } + NormalizeComponent() { } + virtual std::string Type() const { return "NormalizeComponent"; } + virtual void InitFromConfig(ConfigLine *cfl); + virtual Component* Copy() const { return new NormalizeComponent(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { + return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0)); + } + virtual std::string Info() const; + private: + NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow. + enum { kExpSquaredNormFloor = -66 }; + // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly representable in + // float and whose inverse square root is also exactly representable + // in float (hence, an even power of two). + static const BaseFloat kSquaredNormFloor; + int32 input_dim_; + int32 block_dim_; + BaseFloat target_rms_; // The target rms for outputs, default 1.0. + + bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D))) + // is an extra dimension of the output. +}; + + +/* + BatchNormComponent + + This implements batch normalization; for each dimension of the + input it normalizes the data to be zero-mean, unit-variance. You + can set the block-dim configuration value to implement spatial + batch normalization, see the comment for the variable. + + If you want to combine this with the trainable offset and scale that the + original BatchNorm paper used, then follow this by the + ScaleAndOffsetComponent. + + It's a simple component (uses the kSimpleComponent flag), but it is unusual in + that it will give different results if you call it on half the matrix at a + time. Most of the time this would be pretty harmless, so we still return the + kSimpleComponent flag. We may have to modify the test code a little to + account for this, or possibly remove the kSimpleComponent flag. In some sense + each output Index depends on every input Index, but putting those dependencies + explicitly into the dependency-tracking framework as a GeneralComponent + would be very impractical and might lead to a lot of unnecessary things being + computed. You have to be a bit careful where you put this component, and understand + what you're doing e.g. putting it in the path of a recurrence is a bit problematic + if the minibatch size is small. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a nonzero divisor + of 'dim'. 
In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from the n'th element of each + block are pooled into one class, for each n. + epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + */ +class BatchNormComponent: public Component { + public: + + BatchNormComponent() { } + + // call this with 'true' to set 'test mode' where the batch normalization is + // done with stored stats. There won't normally be any need to specially + // accumulate these stats; they are stored as a matter of course on each + // iteration of training, as for NonlinearComponents, and we'll use the stats + // from the most recent [script-level] iteration. + // (Note: it will refuse to actually set test-mode to true if there + // are no stats stored.) + void SetTestMode(bool test_mode); + + // constructor using another component + BatchNormComponent(const BatchNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "BatchNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| + kBackpropInPlace| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in, + CuMatrixBase<BaseFloat> *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase<BaseFloat> &in_value, + const CuMatrixBase<BaseFloat> &out_value, + const CuMatrixBase<BaseFloat> &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase<BaseFloat> *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new BatchNormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + + virtual void DeleteMemo(void *memo) const { delete static_cast<Memo*>(memo); } + + virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value, + const CuMatrixBase<BaseFloat> &out_value, + void *memo); + + // Members specific to this component type. + // Note: the offset and scale will only be nonempty in 'test mode'. + const CuVector<BaseFloat> &Offset() const { return offset_; } + const CuVector<BaseFloat> &Scale() const { return scale_; } + + private: + + struct Memo { + // number of frames (after any reshaping). + int32 num_frames; + // 'mean_uvar_scale' is of dimension 5 by block_dim_: + // Row 0 = mean = the mean of the rows of the input + // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames).
+ // Row 2 = scale = the scale of the renormalization. + // Rows 3 and 4 are used as temporaries in Backprop. + CuMatrix mean_uvar_scale; + }; + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(double count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + // computes derived parameters offset_ and scale_. + void ComputeDerived(); + + // Dimension of the input and output. + int32 dim_; + // This would normally be the same as dim_, but if it's less (and it must be > + // 0 and must divide dim_), then each separate block of the input of dimension + // 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This value will normally be 1.0, which is the default, but you can set it + // to other values as a way to control how fast the following layer learns + // (smaller -> slower). The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // This is true if we want the batch normalization to operate in 'test mode' + // meaning the data mean and stddev used for the normalization are fixed + // quantities based on previously accumulated stats. Note: the stats we use + // for this are based on the same 'StoreStats' mechanism as we use for + // components like SigmoidComponent and ReluComponent; we'll be using + // the stats from the most recent [script-level] iteration of training. + bool test_mode_; + + + // total count of stats stored by StoreStats(). + double count_; + // sum-of-data component of stats of input data. + CuVector stats_sum_; + // sum-of-squared component of stats of input data. + CuVector stats_sumsq_; + + // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they + // dictate the transform that is done in 'test mode'. They are set only when + // reading the model from disk and when calling SetTestMode(true); they are + // resized to empty when the stats are updated, to ensure that out-of-date + // values are not kept around. + CuVector offset_; + CuVector scale_; +}; + + + +} // namespace nnet3 +} // namespace kaldi + + +#endif diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index bcb02184720..35614d62b34 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -143,7 +143,7 @@ static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, KALDI_LOG << "Output sum (optimized) is " << output_opt.Sum(); if (!ApproxEqual(output, output_opt)) { KALDI_WARN << "Non-optimized and optimized versions of the computation give " - << "different outputs."; + << "different outputs: " << output << " vs. 
" << output_opt; return false; } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 26aaced54df..c53fba815fb 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -21,7 +21,6 @@ #include "nnet3/nnet-optimize-utils.h" #include "nnet3/nnet-optimize.h" - namespace kaldi { namespace nnet3 { @@ -737,9 +736,7 @@ bool VariableMergingOptimizer::MergeVariables() { // potentially merge into a single variable. const NnetComputation::Command &c = computation_->commands[command_index]; int32 s1 = -1, s2 = -1; - // TODO: add kScale command and remove the check for 1.0 if (c.command_type == kMatrixCopy && - // c.alpha == 1.0 && config_.remove_assignments) { s2 = c.arg1; // s2 is the written-to matrix. s1 = c.arg2; @@ -997,7 +994,7 @@ std::pair VariableMergingOptimizer::MayBeMerged( if (!left && !right) // save some time. return std::pair(false,false); bool is_assignment = (computation_->commands[command_index].command_type == - kMatrixCopy && + kMatrixCopy && computation_->commands[command_index].alpha == 1.0); ComputationAnalysis analysis(*computation_, analyzer_); if (is_assignment) { @@ -1018,6 +1015,268 @@ std::pair VariableMergingOptimizer::MayBeMerged( } +// This class is used inside the function +// `void ExtendMatrices(NnetComputation *computation)`; +// see that function's declaration in nnet-optimize-utils.h for +// a summary of what this class does. +class MatrixExtender { + public: + typedef NnetComputation::SubMatrixInfo SubMatrixInfo; + typedef NnetComputation::MatrixInfo MatrixInfo; + + MatrixExtender(NnetComputation *computation); + + void ExtendMatrices(); + + private: + // This function returns true if a copy command from 'src_submatrix' + // to 'dest_submatrix' has the properties we need to be able to + // extend its rows to cover all of the source matrix. + bool CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index); + + // This actually extends the matrices... it's called only if CanBeExtended() + // with the same args returned true. It modifies 'dest_submatrix_index' + // and 'src_submatrix_index'. + void Extend(int32 *dest_submatrix_index, int32 *src_submatrix_index); + + // This function modifies the computation to fix certain problems + // that might have been introduced by Extend()... allocation, deallocation, + void FixComputation(); + + // This function modifies the computation to fix the debug info; if needed, + // it's called from FixComputation(). + void FixDebugInfo(); + + // don't extend a destination matrix if it wasn't already + // at least 'min_proportion' (80%) big enough to store the source. + BaseFloat min_proportion_; + + NnetComputation *computation_; + + // Indexed by matrix-index m, orig_num_rows_[m] is the value of + // computation_->matrices[m].num_rows when this class was initialized, + // i.e. before we changed anything. + std::vector orig_num_rows_; + + // Indexed by matrix-index m, this vector contains true if matrix + // m is involved in any AcceptInput() or ProvideOutput() operations. + std::vector is_input_or_output_; +}; + +// note: the initializer for min_proportion_ below needs to be kept in sync with +// the min_proportion variable in +// ComputationChecker::CheckComputationUndefined() in nnet-analyze.cc. +MatrixExtender::MatrixExtender(NnetComputation *computation): + min_proportion_(0.8), + computation_(computation) { + int32 num_matrices = computation_->matrices.size(); + + { // set up orig_num_rows_. 
+ orig_num_rows_.resize(num_matrices); + // matrix 0 is not a real matrix so skip that index. + for (int32 m = 1; m < num_matrices; m++) + orig_num_rows_[m] = computation_->matrices[m].num_rows; + } + { // set up is_input_or_output_. + is_input_or_output_.resize(num_matrices, false); + std::vector<NnetComputation::Command>::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + for (; command_iter != command_end; ++command_iter) { + const NnetComputation::Command &command = *command_iter; + // make sure there are no kSwapMatrix commands; they should not be present + // at this stage of optimization. + KALDI_ASSERT(command.command_type != kSwapMatrix); + if (command.command_type == kProvideOutput || + command.command_type == kAcceptInput) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index; + is_input_or_output_[m] = true; + } + } + } +} + + +bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index) { + const SubMatrixInfo + &src_submatrix = computation_->submatrices[src_submatrix_index], + &dest_submatrix = computation_->submatrices[dest_submatrix_index]; + if (src_submatrix.matrix_index == dest_submatrix.matrix_index) + return false; + + // we can't resize the destination matrix if it's involved in input or output. + if (is_input_or_output_[dest_submatrix.matrix_index]) + return false; + + const MatrixInfo + &src_matrix = computation_->matrices[src_submatrix.matrix_index]; + + int32 dest_matrix_orig_num_rows = orig_num_rows_[dest_submatrix.matrix_index], + src_matrix_orig_num_rows = orig_num_rows_[src_submatrix.matrix_index]; + + if (src_submatrix.num_rows < min_proportion_ * src_matrix_orig_num_rows) + return false; + + // The following checks that the source submatrix covers all of the + // source matrix except a few final rows, and the destination submatrix goes + // to the final row of its matrix. + return (src_submatrix.col_offset == 0 && + src_submatrix.num_cols == src_matrix.num_cols && + src_submatrix.row_offset == 0 && + src_submatrix.num_rows < src_matrix.num_rows && + dest_submatrix.row_offset + dest_submatrix.num_rows == + dest_matrix_orig_num_rows); +} + + +void MatrixExtender::Extend(int32 *dest_submatrix_index, + int32 *src_submatrix_index) { + // copy the SubMatrixInfo to avoid iterator invalidation. + SubMatrixInfo + src_submatrix = computation_->submatrices[*src_submatrix_index], + dest_submatrix = computation_->submatrices[*dest_submatrix_index]; + + MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index], + &dest_matrix = computation_->matrices[dest_submatrix.matrix_index]; + + int32 new_dest_num_rows = dest_submatrix.row_offset + src_matrix.num_rows; + + // extend the destination matrix so it has enough rows to fit the entire + // source matrix. Note: doing this will break certain invariances in the + // computation, principally with allocation and deallocation commands, which + // we'll later fix up by calling FixComputation(). + if (new_dest_num_rows > dest_matrix.num_rows) { + dest_matrix.num_rows = new_dest_num_rows; + // make sure there's a submatrix index covering the whole of the dest matrix. + computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix.matrix_index, 0, new_dest_num_rows, + 0, dest_matrix.num_cols)); + } + + // The following 3 statements create a new submatrix that will be + // the destination submatrix; it's the same as the original destination + // submatrix, but with a few extra rows.
+ *dest_submatrix_index = computation_->submatrices.size(); + dest_submatrix.num_rows = src_matrix.num_rows; + computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix)); + + // The following 3 statements create a new submatrix that will be + // the source submatrix; it's the same as the original source + // submatrix, but with a few extra rows, and actually will cover + // the entire source matrix. + *src_submatrix_index = computation_->submatrices.size(); + computation_->submatrices.push_back( + SubMatrixInfo(src_submatrix.matrix_index, 0, src_matrix.num_rows, + 0, src_matrix.num_cols)); +} + +void MatrixExtender::ExtendMatrices() { + std::vector<NnetComputation::Command>::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + bool changed = false; + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kMatrixCopy && + command.alpha == 1.0) { + int32 dest_submatrix_index = command.arg1, + src_submatrix_index = command.arg2; + if (CanBeExtended(dest_submatrix_index, src_submatrix_index)) { + Extend(&command.arg1, &command.arg2); + changed = true; + } + } + } + if (changed) + FixComputation(); +} + +void MatrixExtender::FixComputation() { + // make sure that allocation and deallocation commands + // operate on whole matrix. + std::vector<NnetComputation::Command>::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + std::vector<int32> whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kAllocMatrix || + command.command_type == kDeallocMatrix) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + KALDI_ASSERT( + computation_->submatrices[s] == computation_->submatrices[new_s] || + orig_num_rows_[m] != computation_->matrices[m].num_rows); + command.arg1 = new_s; + } + } + if (command.command_type == kSetConst && command.alpha == 0.0) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + { + const NnetComputation::SubMatrixInfo &info = computation_->submatrices[ + command.arg1]; + const NnetComputation::MatrixInfo &mat_info = computation_->matrices[ + info.matrix_index]; + // If this command wasn't zeroing the entirety of a matrix + // (before we extended the matrix), we don't need to extend it. + if (!(info.row_offset == 0 && info.col_offset == 0 && + info.num_cols == mat_info.num_cols && + info.num_rows == orig_num_rows_[info.matrix_index])) + continue; + // I know doing this via 'continue' is odd, but it's done this way to + // avoid invalid iterators still being in scope; I think some runtimes + // check for it. + } + command.arg1 = new_s; + } + } + } + if (!computation_->matrix_debug_info.empty()) + FixDebugInfo(); + RenumberComputation(computation_); +} + +void MatrixExtender::FixDebugInfo() { + int32 num_matrices = computation_->matrices.size(); + // matrix zero is not a 'real' matrix.
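// As a worked example of the padding performed below: if a matrix grew from
// 3 rows to 5, the loop keeps the 3 original cindexes and fills rows 3 and 4
// with copies of rows 1 and 2, setting t = kNoTime (the n values shown are
// invented for illustration):
//   before: (n=0,t=0) (n=0,t=1) (n=0,t=2)
//   after:  (n=0,t=0) (n=0,t=1) (n=0,t=2) (n=0,t=kNoTime) (n=0,t=kNoTime)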
+ for (int32 m = 1; m < num_matrices; m++) { + NnetComputation::MatrixDebugInfo &debug_info = + computation_->matrix_debug_info[m]; + int32 new_num_rows = computation_->matrices[m].num_rows, + old_num_rows = debug_info.cindexes.size(); + if (new_num_rows != old_num_rows) { + debug_info.cindexes.resize(new_num_rows); + int32 num_extra_rows = new_num_rows - old_num_rows; + // the following should be true because min_proportion_ > 0.5. + KALDI_ASSERT(num_extra_rows <= old_num_rows); + for (int32 r = old_num_rows; r < new_num_rows; r++) { + Cindex cindex = debug_info.cindexes[r - num_extra_rows]; + // set the 't' value to kNoTime which indicates that it's not a 'real' + // time step, and may avoid errors in checking code. + cindex.second.t = kNoTime; + debug_info.cindexes[r] = cindex; + } + } + } +} + +void ExtendMatrices(NnetComputation *computation) { + MatrixExtender ext(computation); + ext.ExtendMatrices(); +} + + + /** This class is responsible for consolidating the model-update part of backprop commands, for components in (e.g.) recurrent networks that need to have many separate backprop commands, into more efficient single commands @@ -2555,7 +2814,8 @@ static void ConvertNumNValues(int32 n_stride, int32 old_N, int32 new_N, // This class implements the internals of the ExpandComputation() function (used // in shortcut compilation); see comment by the declaration of -// ExpandComputation() in nnet-optimize-utils.h for overview. +// ExpandComputation() in nnet-optimize-utils.h for overview. (It relates to +// shortcut compilation). class ComputationExpander { public: ComputationExpander(const Nnet &nnet, @@ -2952,6 +3212,7 @@ void ComputationExpander::ComputeCommands() { case kAddRowRanges: ExpandRowRangesCommand(c, &c_out); break; + case kCompressMatrix: case kDecompressMatrix: case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: @@ -3466,13 +3727,12 @@ class ComputationLoopedOptimizer { /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list /// of matrices which are 'active' at that point in time. By 'active' we mean - /// that the matrix has been written to before that time (note, we don't count - /// initialization with zeros as being written to); and will be read after - /// that time. These is the list of matrices that 'need to be in scope' - /// at those points in time. '*active_matrices' is indexed by the - /// same index as 'splice_point_commands', and is then a list of active - /// matrices, in numerical order of matrix index. - /// Note: for each i, (*active_matrices)[i] will be sorted and unique. + /// that the matrix has been written to before that time (including zeroing), + /// and will be read after that time. This is the list of matrices that + /// 'need to be in scope' at those points in time. '*active_matrices' is + /// indexed by the same index as 'splice_point_commands', and is then a list + /// of active matrices, in numerical order of matrix index. Note: for each i, + /// (*active_matrices)[i] will be sorted and unique. static void FindActiveMatrices(const NnetComputation &computation, const Analyzer &analyzer, const std::vector<int32> &splice_point_commands, @@ -4045,5 +4305,328 @@ void RemoveCommandsForUnusedMatrix(const Analyzer &analyzer, } } + + +// This comparison operator is used in the function InsertCommands() +// to sort a list of these pairs by the .first element.
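The sorted-insert merge that InsertCommands() performs can be sketched generically as follows; MergeInserts and its argument names are illustrative only, not Kaldi API:

#include <algorithm>
#include <utility>
#include <vector>

// Merge (position, value) insertions into 'items'; each value is placed
// just before the item whose index equals its position. Equal positions
// keep their relative order because the sort is stable.
template <typename T>
void MergeInserts(std::vector<std::pair<int, T> > *inserts,
                  std::vector<T> *items) {
  std::stable_sort(inserts->begin(), inserts->end(),
                   [](const std::pair<int, T> &a, const std::pair<int, T> &b) {
                     return a.first < b.first;
                   });
  std::vector<T> merged;
  merged.reserve(items->size() + inserts->size());
  size_t j = 0;
  for (size_t i = 0; i <= items->size(); i++) {
    while (j < inserts->size() && (*inserts)[j].first <= static_cast<int>(i))
      merged.push_back((*inserts)[j++].second);
    if (i < items->size()) merged.push_back((*items)[i]);
  }
  items->swap(merged);
}

The struct below supplies the '<' on the .first element that the real function passes to std::stable_sort.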
+struct CommandPairComparator { + // operator () should be viewed as a '<' operator that only looks at + // the .first element, treating the .second elements as equal. + bool operator () (const std::pair<int32, NnetComputation::Command> &p1, + const std::pair<int32, NnetComputation::Command> &p2) const { + return p1.first < p2.first; + } +}; + +void InsertCommands( + std::vector<std::pair<int32, NnetComputation::Command> > *new_commands, + NnetComputation *computation) { + int32 num_new_commands = new_commands->size(), + num_old_commands = computation->commands.size(); + if (num_new_commands == 0) + return; + CommandPairComparator comparison_operator; + // use std::stable_sort so that entries in 'new_commands' that + // have the same .first value stay in the same order they were + // in before sorting. + std::stable_sort(new_commands->begin(), new_commands->end(), + comparison_operator); + + if (RandInt(0, 3) == 0) { // check 'new_commands' + for (int32 i = 0; i + 1 < num_new_commands; i++) { + KALDI_ASSERT((*new_commands)[i].first <= (*new_commands)[i+1].first && + (*new_commands)[i].first >= 0 && + (*new_commands)[i+1].first <= num_old_commands); + } + } + std::vector<NnetComputation::Command> merged_commands; + merged_commands.reserve(num_old_commands + num_new_commands); + + std::vector<std::pair<int32, NnetComputation::Command> >::const_iterator + new_commands_iter = new_commands->begin(), + new_commands_end = new_commands->end(); + + for (int32 old_command_index = 0; old_command_index <= num_old_commands; + old_command_index++) { + while (new_commands_iter != new_commands_end && + new_commands_iter->first <= old_command_index) { + merged_commands.push_back(new_commands_iter->second); + ++new_commands_iter; + } + if (old_command_index < num_old_commands) + merged_commands.push_back(computation->commands[old_command_index]); + } + KALDI_ASSERT(merged_commands.size() == num_old_commands + + num_new_commands); + // copy to 'computation->commands' via shallow swap. + computation->commands.swap(merged_commands); + FixGotoLabel(computation); +} + +/** + This class is used in the function OptimizeMemoryCompression(), + once we determine that there is some potential to do memory compression + for this computation. + */ +class MemoryCompressionOptimizer { + public: + + /** @param [in] nnet The neural net the computation is for. + @param [in] memory_compression_level The level of compression: + 0 = no compression (the constructor should not be called with this value). + 1 = compression that doesn't affect the results (but still takes time). + 2 = compression that affects the results only very slightly + 3 = compression that affects the results a little more. + @param [in] middle_command Must be the command-index of the + command of type kNoOperationMarker in 'computation'. + @param [in,out] computation The computation we're optimizing. + */ + MemoryCompressionOptimizer(const Nnet &nnet, + int32 memory_compression_level, + int32 middle_command, + NnetComputation *computation): + nnet_(nnet), memory_compression_level_(memory_compression_level), + middle_command_(middle_command), computation_(computation) { } + + void Optimize(); + private: + + // This function, called from Optimize(), figures out whether we can compress + // matrix m, and if so, adds an entry to compress_info_. + void ProcessMatrix(int32 m); + + // This function modifies the commands in '*computation_', taking + // as input the commands in compress_info_. + void ModifyComputation(); + + // While deciding what matrices to compress we will create a list of structs + // of type MatrixCompressInfo.
Later we copy-and-modify the commands in the + // computation, putting the compression commands into their appropriate places. + struct MatrixCompressInfo { + // m is the matrix-index of the matrix we're going to compress. + int32 m; + // compression_command_index is the command-index of the command + // *after* which we will place the compression command. Normally + // this will be some type of propagation. + int32 compression_command_index; + // uncompression_command_index is the command-index of the command + // *before* which we will place the uncompression command. Normally + // this will be some type of backprop. + int32 uncompression_command_index; + // 'compression_type' (e.g. kCompressedMatrixInt8) determines the type + // we compress the BaseFloats to. + CuCompressedMatrixType compression_type; + // 'range' determines the range of values that the compressed values can + // be in: for signed types they are in [-range, range]; for unsigned + // types, in [0, range]. + // As a special case, range = 0 means that the compression just stores the + // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this + // is useful for ReLUs. + BaseFloat range; + // this is provided to the initializer of CuCompressedMatrix; it should + // be true if the values being compressed are potentially outside of + // the representable range. + bool truncate; + MatrixCompressInfo(int32 m, int32 forward_command_index, + int32 backward_command_index, + CuCompressedMatrixType compression_type, + BaseFloat range, bool truncate): + m(m), compression_command_index(forward_command_index), + uncompression_command_index(backward_command_index), + compression_type(compression_type), range(range), + truncate(truncate) { } + + }; + std::vector<MatrixCompressInfo> compress_info_; + + const Nnet &nnet_; + int32 memory_compression_level_; + int32 middle_command_; + NnetComputation *computation_; + Analyzer analyzer_; +}; + + +void MemoryCompressionOptimizer::ModifyComputation() { + // whole_submatrices[m] is the submatrix-index of the submatrix that + // represents the whole of matrix m. + std::vector<int32> whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + + // 'pairs_to_insert' will be a list of pairs (command-index, command), + // meaning: (command-index just before which to insert this command; command + // to insert). + std::vector<std::pair<int32, NnetComputation::Command> > + pairs_to_insert; + pairs_to_insert.reserve(compress_info_.size() * 2); + for (size_t i = 0; i < compress_info_.size(); i++) { + const MatrixCompressInfo &info = compress_info_[i]; + int32 s = whole_submatrices[info.m]; + // below we use compression_command_index + 1 because we want the + // compression to go after the command in 'info.compression_command_index' + // (which might be, for instance, a forward propagation command). + std::pair<int32, NnetComputation::Command> p1( + info.compression_command_index + 1, + NnetComputation::Command(info.range, kCompressMatrix, + s, static_cast<int32>(info.compression_type), + info.truncate ? 1 : 0)); + pairs_to_insert.push_back(p1); + std::pair<int32, NnetComputation::Command> p2( + info.uncompression_command_index, + NnetComputation::Command(1.0, kDecompressMatrix, s)); + pairs_to_insert.push_back(p2); + } + InsertCommands(&pairs_to_insert, + computation_); +} + + +void MemoryCompressionOptimizer::Optimize() { + analyzer_.Init(nnet_, *computation_); + // note: matrix zero is not really a matrix.
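+  // (Schematic of the rewrite that ProcessMatrix() plus ModifyComputation()
+  // produce, under the assumption that matrix m's last forward-pass access
+  // is a propagate and its first backward-pass access is a backprop:
+  //
+  //   before:  ..., propagate(...m...), ..., kNoOperationMarker,
+  //            ..., backprop(...m...), ...
+  //   after:   ..., propagate(...m...), kCompressMatrix(m), ...,
+  //            kNoOperationMarker, ..., kDecompressMatrix(m),
+  //            backprop(...m...), ...
+  //
+  // so m is held in compressed form across the forward/backward boundary,
+  // which is typically where memory use peaks.)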
+ int32 num_matrices = computation_->matrices.size(); + for (int32 m = 1; m < num_matrices; m++) + ProcessMatrix(m); + if (!compress_info_.empty()) + ModifyComputation(); +} + +void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { + if (analyzer_.matrix_accesses[m].is_output) { + return; // We can't do this optimization for matrices that are going to be + // output to the user. + } + + // 'accesses' lists the commands that access this matrix. + const std::vector<Access> &accesses = analyzer_.matrix_accesses[m].accesses; + // the 'kReadAccess' below is actually a don't-care. This is just + // to find the position in 'accesses' that corresponds to command-index + // 'middle_command'. + Access middle_access(middle_command_, kReadAccess); + std::vector<Access>::const_iterator iter = std::lower_bound(accesses.begin(), + accesses.end(), + middle_access); + // At this point, 'iter' points to the first access in 'accesses' + // whose command index is >= 'middle_command_' (which separates the forward + // and backward passes), or accesses.end() if this matrix was not + // accessed during the backward pass. + if (iter == accesses.end()) { + return; // There is nothing to do: this matrix was not accessed during the + // backward pass. + } + if (iter == accesses.begin()) { + return; // There is nothing to do: this matrix was not accessed during the + // forward pass. + } + // 'backward_access' is the first access of the matrix in the backward + // pass of the computation, and + // 'forward_access' is the last access of the matrix in the forward pass + // of the computation. + const Access &backward_access = iter[0], + &forward_access = iter[-1]; + KALDI_ASSERT(forward_access.command_index < middle_command_ && + backward_access.command_index > middle_command_); + + // 'backward_access_is_last_access' is going to be set to true if + // 'backward_access' is the last command to access the matrix (apart from + // deallocation or matrix-swap commands, which don't show up in the list of + // accesses). + bool backward_access_is_last_access = (accesses.end() == iter + 1); + + int32 backward_command_index = backward_access.command_index, + forward_command_index = forward_access.command_index; + NnetComputation::Command + &backward_command = computation_->commands[backward_command_index]; + + if (memory_compression_level_ >= 1 && + backward_access_is_last_access && + backward_access.access_type == kReadAccess && + backward_command.command_type == kBackprop) { + int32 component_index = backward_command.arg1; + const Component *component = nnet_.GetComponent(component_index); + // this is potentially a candidate for our optimization for ReLU units, + // where we only need to store the sign. + if (component->Type() == "RectifiedLinearComponent") { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixUint8, 0.0, + true)); + return; + } + } + + // If memory_compression_level >= 2 (an "intermediate" level of compression), + // then we'll consider compressing quantities using 16 bits in the range + // [-10, 10]. Because of the way this compression works, exact zero will + // still be uncompressed as exact zero, so even if this is the output + // of a ReLU, it's OK. (Having a few derivatives zero for ReLU outputs + // that were very close to zero is OK.)
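+  // (Rough numeric sketch of the 16-bit case -- the exact rounding is up to
+  // CuCompressedMatrix, so take this as illustrative only: a symmetric linear
+  // quantizer with range = 10.0 has step 10.0 / 32767 ~= 3.05e-04, so values
+  // in [-10, 10] come back with error of at most ~1.5e-04, e.g.
+  //
+  //   float x = 0.123456f;
+  //   short q = (short)roundf(x * 32767.0f / 10.0f);  // q == 405
+  //   float x_hat = q * 10.0f / 32767.0f;             // ~= 0.12360
+  //
+  // and x == 0.0f maps to q == 0, which decompresses to exactly 0.0f.)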
+ if (memory_compression_level_ >= 2) { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixInt16, 10.0, + true)); + return; + } + + // TODO: later maybe implement something for memory compression level = 3. +} + + + + +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation) { + if (memory_compression_level == 0 || computation->commands.empty()) + return; + // don't apply this optimization to looped computations. + if (computation->commands.back().command_type == kGotoLabel) + return; + + // 'middle_command' will be the index of the command of type + // 'kNoOperationMarker' that separates the forward and backward + // passes. If it doesn't exist, it means this computation doesn't + // include a backprop pass. + int32 middle_command = -1; + for (size_t i = 0; i < computation->commands.size(); i++) { + if (computation->commands[i].command_type == kNoOperationMarker) { + if (middle_command < 0) { + middle_command = static_cast<int32>(i); + } else { + KALDI_WARN << "Found more than one command of type kNoOperationMarker " + "in non-looped computation."; + // there is more than one command of this type... this wasn't expected. + // return (i.e. do nothing). + return; + } + } + } + if (middle_command == -1) { + return; // This computation doesn't have a backprop pass. + } + if (memory_compression_level >= 1) { + int64 bytes_used_initial, bytes_used_final; + if (GetVerboseLevel() >= 2) + bytes_used_initial = GetMaxMemoryUse(*computation); + + MemoryCompressionOptimizer opt(nnet, memory_compression_level, + middle_command, computation); + opt.Optimize(); + + if (GetVerboseLevel() >= 2) { + bytes_used_final = GetMaxMemoryUse(*computation); + if (bytes_used_final != bytes_used_initial) { + KALDI_VLOG(2) << "Memory compression reduced memory use from " + << bytes_used_initial << " to " + << bytes_used_final << " bytes."; + } + } + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 98615e2e146..703f43af095 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -181,6 +181,16 @@ class VariableMergingOptimizer { bool already_called_merge_variables_; }; +/** + This is not really an optimization in itself but it can make things easier + for class VariableMergingOptimizer (usually called by its wrapper + VariableMergingOptimization()). It looks for a case where most of a matrix + (but not its final rows) is copied to some submatrix of another matrix, + where the row-range of that submatrix extends to the last row of the other + matrix; and it extends the other matrix with additional rows so that the + entire source matrix can be copied to the destination. + */ +void ExtendMatrices(NnetComputation *computation); /** @@ -524,6 +534,46 @@ void IdentifyIndexesArgs(std::vector<NnetComputation::Command> *commands, void IdentifyIndexesRangesArgs(std::vector<NnetComputation::Command> *commands, std::vector<int32*> *indexes_ranges_args); +/// Inserts commands into the computation at the requested places. 'commands' +/// is a list of pairs (command-index, command). For each entry (c, command) in +/// 'commands', 'command' is inserted into 'computation' just *before* the +/// command that (at entry) is in computation->commands[c].
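+/// (Small worked example: if the computation's commands are [c0, c1, c2] and
+/// 'commands' is [(1,a), (1,b), (3,x)], the result is [c0, a, b, c1, c2, x]:
+/// 'a' and 'b' both go just before the old c1, in their original order, and
+/// index 3 == commands.size() places 'x' at the end.)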
If there are multiple pairs with the same index + /// c, they will remain in the same order in which they were present in + /// 'commands'; however, 'commands' does not have to be sorted on 'c'. + /// As a special case, if c == computation->commands.size(), the + /// corresponding commands are inserted at the end of the computation. + /// This function will appropriately renumber the argument of the kGotoLabel + /// command of any 'looped' computation. Command indexes c in commands[*].first + /// must be in the range [0, computation->commands.size()]. + /// This function may modify 'commands' by sorting it. +void InsertCommands( + std::vector<std::pair<int32, NnetComputation::Command> > *commands, + NnetComputation *computation); + +/// Performs optimization to reduce memory usage where possible, +/// making use of the kCompressMatrix and kDecompressMatrix commands. +/// Should only be done after most other optimizations, because some +/// optimizations (such as variable-merging) would not work correctly +/// after doing this optimization. This does nothing for looped +/// computations. It's OK, though, to expand a shortcut computation +/// (i.e. call ExpandComputation) after doing this. +/// +/// memory_compression_level determines how aggressive the compression +/// is. Allowed values: +/// 0 = no compression at all +/// 1 = compression that doesn't affect results (e.g. compress +/// ReLU outputs to 1 byte, as just the sign is needed). +/// 2 = compression that may affect the results slightly (16-bit +/// compression, with a limited range, of quantities needed for +/// backprop, e.g. the outputs of NormalizeComponent and the like). +/// 3 = compression that may affect the results more than just +/// slightly. Not implemented yet, so currently equivalent to 2. +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation); + + /// This function tries to optimize computation 'computation' for an 'looped' /// computation. It expects as input a computation with no backprop but with /// multiple 'segments' separated by command kNoOperationLabel, where each diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 7824ee88b5a..d614afce7d0 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -439,7 +439,7 @@ void ConvertAdditionToAssignment(const Nnet &nnet, case kMatrixAdd: c.command_type = kMatrixCopy; break; case kAddRows: c.command_type = kCopyRows; - break; + break; case kAddRowsMulti: c.command_type = kCopyRowsMulti; break; // note: kCopyToRowsMulti does not currently support alpha != 1.0.
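// (Background on this switch, as a rough sketch rather than the real
// analysis code: ConvertAdditionToAssignment rewrites an addition as a copy
// when the analysis shows the destination holds no prior data that matters,
// since for such a matrix M,
//
//   M.AddMat(1.0, X);    // M += X, must read M first
//
// computes the same result as
//
//   M.CopyFromMat(X);    // M = X, no read of M
//
// and copies can later be removed entirely by the variable-merging
// optimization.)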
@@ -515,13 +515,6 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, true); } - if (config.optimize && - (config.remove_assignments || config.backprop_in_place || - config.propagate_in_place)) { - VariableMergingOptimization(config, nnet, computation); - if (GetVerboseLevel() >= 3) - CheckComputation(nnet, *computation, false); - } if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { bool must_renumber = false; @@ -536,6 +529,21 @@ void Optimize(const NnetOptimizeOptions &config, } } + if (config.optimize && config.extend_matrices && + !config.optimize_looped_computation) { + ExtendMatrices(computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + + + if (config.optimize && + (config.remove_assignments || config.backprop_in_place || + config.propagate_in_place)) { + VariableMergingOptimization(config, nnet, computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); @@ -543,7 +551,9 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && config.move_sizing_commands) { + + if ((config.optimize && config.move_sizing_commands) || + config.optimize_looped_computation) { MoveSizingCommands(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); @@ -552,7 +562,7 @@ void Optimize(const NnetOptimizeOptions &config, // the looped computation optimization has to go before // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' // because it's necessary for looped computation to run. - if (config.optimize_looped_computation){ + if (config.optimize_looped_computation) { OptimizeLoopedComputation(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); @@ -577,11 +587,21 @@ void Optimize(const NnetOptimizeOptions &config, if (config.optimize_looped_computation) FixGotoLabel(computation); + + if (config.memory_compression_level > 0 && + !config.optimize_looped_computation) { + OptimizeMemoryCompression(nnet, config.memory_compression_level, + computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); } + } // ComputationRequests are distinguished by the names and indexes diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 4ffa4de449e..31872e46b72 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -32,12 +32,14 @@ namespace nnet3 { // Options class for optimizing a NnetComputation. The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. +// See the Register() function below for option-specific documentation. struct NnetOptimizeOptions { bool optimize; // setting this false disallow all optimization. 
bool consolidate_model_update; bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool extend_matrices; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -49,6 +51,7 @@ struct NnetOptimizeOptions { int32 max_deriv_time; int32 max_deriv_time_relative; bool snip_row_ops; + int32 memory_compression_level; // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for // looped computation that turns a linear computation into a loop. @@ -60,6 +63,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + extend_matrices(true), convert_addition(true), remove_assignments(true), allow_left_merge(true), @@ -71,6 +75,7 @@ struct NnetOptimizeOptions { max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), snip_row_ops(true), + memory_compression_level(1), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -84,6 +89,9 @@ struct NnetOptimizeOptions { "disable optimization that allows in-place propagation"); opts->Register("backprop-in-place", &backprop_in_place, "Set to false to " "disable optimization that allows in-place backprop"); + opts->Register("extend-matrices", &extend_matrices, "This optimization " + "can reduce memory requirements for TDNNs when applied " + "together with --convert-addition=true"); opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " "disable certain optimizations that act on operations of " "type *Row*."); @@ -123,6 +131,14 @@ struct NnetOptimizeOptions { opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " "disable an optimization that reduces the size of certain " "per-row operations"); + opts->Register("memory-compression-level", &memory_compression_level, + "This is only relevant to training, not decoding. Set this " + "to 0,1,2; higher levels are more aggressive at reducing " + "memory by compressing quantities needed for backprop, " + "potentially at the expense of speed and the accuracy " + "of derivatives. 0 means no compression at all; 1 means " + "compression that shouldn't affect results at all."); + } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 2c4da825013..bb3a209460a 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -481,7 +481,7 @@ static void PrintFloatSuccinctly(std::ostream &os, BaseFloat f) { // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. 
-std::string SummarizeVector(const Vector<BaseFloat> &vec) { +std::string SummarizeVector(const VectorBase<float> &vec) { std::ostringstream os; if (vec.Dim() < 10) { os << "[ "; @@ -517,6 +517,16 @@ std::string SummarizeVector(const Vector<BaseFloat> &vec) { return os.str(); } +std::string SummarizeVector(const VectorBase<double> &vec) { + Vector<float> vec_copy(vec); + return SummarizeVector(vec_copy); +} + +std::string SummarizeVector(const CuVectorBase<BaseFloat> &cu_vec) { + Vector<BaseFloat> vec(cu_vec); + return SummarizeVector(vec); +} + void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuVectorBase<BaseFloat> &params, @@ -537,7 +547,10 @@ void PrintParameterStats(std::ostringstream &os, void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix<BaseFloat> &params, - bool include_mean) { + bool include_mean, + bool include_row_norms, + bool include_column_norms, + bool include_singular_values) { os << std::setprecision(4); os << ", " << name << '-'; int32 dim = params.NumRows() * params.NumCols(); @@ -551,8 +564,26 @@ void PrintParameterStats(std::ostringstream &os, os << "rms=" << rms; } os << std::setprecision(6); // restore the default precision. - if (GetVerboseLevel() >= 2) { - // At verbose level >= 2, print stats of the singular values of the matrix. + + if (include_row_norms) { + CuVector<BaseFloat> row_norms(params.NumRows()); + row_norms.AddDiagMat2(1.0, params, kNoTrans, 0.0); + row_norms.ApplyPow(0.5); + Vector<BaseFloat> row_norms_cpu; + row_norms.Swap(&row_norms_cpu); + os << ", " << name << "-row-norms=" + << SummarizeVector(row_norms_cpu); + } + if (include_column_norms) { + CuVector<BaseFloat> col_norms(params.NumCols()); + col_norms.AddDiagMat2(1.0, params, kTrans, 0.0); + col_norms.ApplyPow(0.5); + Vector<BaseFloat> col_norms_cpu; + col_norms.Swap(&col_norms_cpu); + os << ", " << name << "-col-norms=" + << SummarizeVector(col_norms_cpu); + } + if (include_singular_values) { Matrix<BaseFloat> params_cpu(params); Vector<BaseFloat> s(std::min(params.NumRows(), params.NumCols())); params_cpu.Svd(&s); diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index fef21301ff6..0b2e0041aaa 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -189,9 +189,16 @@ std::string ErrorContext(std::istream &is); std::string ErrorContext(const std::string &str); -// Returns a string that summarizes a vector fairly succintly, for -// printing stats in info lines. -std::string SummarizeVector(const Vector<BaseFloat> &vec); +/** Returns a string that summarizes a vector fairly succinctly, for + printing stats in info lines. For example: + "[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.003,0.003,0.004 \ + 0.005,0.01,0.07,0.11,0.14 0.18,0.24,0.29,0.39), mean=0.0745, stddev=0.0611]" +*/ +std::string SummarizeVector(const VectorBase<float> &vec); + +std::string SummarizeVector(const VectorBase<double> &vec); + +std::string SummarizeVector(const CuVectorBase<BaseFloat> &vec); /** Print to 'os' some information about the mean and standard deviation of some parameters, used in Info() functions in nnet-simple-component.cc. @@ -213,13 +220,25 @@ void PrintParameterStats(std::ostringstream &os, PrintParameterStats(os, "linear-params", linear_params_); would print to 'os' something like the string ", linear-params-rms=0.239". - If you set include_mean to true, it will print something like + If you set 'include_mean' to true, it will print something like ", linear-params-{mean-stddev}=0.103,0.183".
+ If you set 'include_row_norms' to true, it will print something + like + ", linear-params-row-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_column_norms' to true, it will print something + like + ", linear-params-col-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_singular_values' to true, it will print something + like + ", linear-params-singular-values=[percentiles(0,1........, stddev=0.0508]" */ void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix ¶ms, - bool include_mean = false); + bool include_mean = false, + bool include_row_norms = false, + bool include_column_norms = false, + bool include_singular_values = false); } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index c6d2c1f7952..b3cf89ae6b4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -313,181 +313,6 @@ void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); } -const BaseFloat NormalizeComponent::kSquaredNormFloor = - pow(2.0, NormalizeComponent::kExpSquaredNormFloor); - -NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): - input_dim_(other.input_dim_), block_dim_(other.block_dim_), - target_rms_(other.target_rms_), - add_log_stddev_(other.add_log_stddev_) { } - -void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { - input_dim_ = 0; - add_log_stddev_ = false; - target_rms_ = 1.0; - bool ok = cfl->GetValue("dim", &input_dim_) || - cfl->GetValue("input-dim", &input_dim_); - block_dim_ = input_dim_; - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("add-log-stddev", &add_log_stddev_); - if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || - block_dim_ <= 0 || input_dim_ % block_dim_ != 0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; -} - -void NormalizeComponent::Read(std::istream &is, bool binary) { - std::string token; - ReadToken(is, binary, &token); - if (token == "") { - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == "" || token == ""); - ReadBasicType(is, binary, &input_dim_); // Read dimension. - ReadToken(is, binary, &token); - if (token == "") { - ReadBasicType(is, binary, &block_dim_); - ReadToken(is, binary, &token); - } else { - block_dim_ = input_dim_; - } - // read target_rms_ if it is available. - if (token == "") { - ReadBasicType(is, binary, &target_rms_); - ReadToken(is, binary, &token); - } - // Read add_log_stddev_ token, if it is available. - if (token == "") { - ReadBasicType(is, binary, &add_log_stddev_); - ReadToken(is, binary, &token); - } else { - add_log_stddev_ = false; - } - if (token == "") { - // back-compatibility code. 
- CuVector temp; - temp.Read(is, binary); - ExpectToken(is, binary, ""); - temp.Read(is, binary); - ExpectToken(is, binary, ""); - double count; - ReadBasicType(is, binary, &count); - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == ""); -} - -void NormalizeComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - if (block_dim_ != input_dim_) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, block_dim_); - } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, add_log_stddev_); - WriteToken(os, binary, ""); -} - -std::string NormalizeComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", input-dim=" << InputDim() - << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ - << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; - if (block_dim_ != input_dim_) - stream << ", block-dim=" << block_dim_; - return stream.str(); -} - -// The output y_i = scale * x_i, -// and we want to RMS value of the y_i to equal target_rms, -// so y^t y = D * target_rms^2 (if y is one row of the input). -// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). -// there is also flooring involved, to avoid division-by-zero -// problems. It's important for the backprop, that the floor's -// square root is exactly representable as float. -// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) -// is an extra dimension of the output. -void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && - in.NumRows() == out->NumRows()); - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - CuSubMatrix in_reshaped(in.Data(), new_num_rows, - block_dim_, block_dim_), - out_reshaped(out->Data(), new_num_rows, - output_block_dim, output_block_dim); - cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, - &out_reshaped); - } else { - cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); - } - return NULL; -} - -/* - A note on the derivative of NormalizeComponent... - let both row_in and row_out be vectors of dimension D. - Let p = row_in^T row_in / (D * target_rms^2), and let - f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: - row_out = f row_in. - Suppose we have a quantity deriv_out which is the derivative - of the objective function w.r.t. row_out. We want to compute - deriv_in which is the derivative of the objective function w.r.t. - row_in. Let the objective function be F. One term is obvious: we have - deriv_in = f deriv_out + .... - next we have to take into account the derivative that gets back-propagated - through f. Obviously, dF/df = deriv_out^T row_in. - And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), - and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. - So this term in dF/d(row_in) equals: - dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in - So - deriv_in = f deriv_out + (f == 1.0 ? 
0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in - - if add_log_stddev_ true, the deriv_in has another term as - dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) -*/ -void NormalizeComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const { - if (!in_deriv) - return; - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in_value.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && - out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, - block_dim_, block_dim_), - out_deriv_reshaped(out_deriv.Data(), new_num_rows, - output_block_dim, output_block_dim), - in_deriv_reshaped(in_deriv->Data(), new_num_rows, - block_dim_, block_dim_); - cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, - add_log_stddev_, &in_deriv_reshaped); - } else { - cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, - in_deriv); - } -} - void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { @@ -506,8 +331,10 @@ void SigmoidComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); SigmoidComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -1015,8 +842,10 @@ void TanhComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); TanhComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -1065,8 +894,10 @@ void RectifiedLinearComponent::Backprop( in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -1200,13 +1031,15 @@ void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { AffineComponent::AffineComponent(const AffineComponent &component): UpdatableComponent(component), linear_params_(component.linear_params_), - bias_params_(component.bias_params_) { } + bias_params_(component.bias_params_), + orthonormal_constraint_(component.orthonormal_constraint_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): linear_params_(linear_params), - bias_params_(bias_params) { + bias_params_(bias_params), + orthonormal_constraint_(0.0) { SetUnderlyingLearningRate(learning_rate); KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& bias_params.Dim() != 0); @@ -1232,7 +1065,13 @@ void AffineComponent::PerturbParams(BaseFloat stddev) { std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); + if (orthonormal_constraint_ != 0.0) + stream << ", 
orthonormal-constraint=" << orthonormal_constraint_; + PrintParameterStats(stream, "linear-params", linear_params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } @@ -1294,6 +1133,8 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { Init(input_dim, output_dim, param_stddev, bias_stddev); } + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -1362,6 +1203,12 @@ void AffineComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); } @@ -1371,6 +1218,10 @@ void AffineComponent::Write(std::ostream &os, bool binary) const { linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); } @@ -1688,7 +1539,7 @@ void NaturalGradientRepeatedAffineComponent::Update( try { // Only apply the preconditioning/natural-gradient if we're not computing // the exact gradient. - preconditioner_in_.PreconditionDirections(&deriv, NULL, &scale); + preconditioner_in_.PreconditionDirections(&deriv, &scale); } catch (...) { int32 num_bad_rows = 0; for (int32 i = 0; i < out_deriv.NumRows(); i++) { @@ -2103,12 +1954,6 @@ void PerElementScaleComponent::Backprop( PerElementScaleComponent *to_update = dynamic_cast(to_update_in); - if (in_deriv) { - // Propagate the derivative back to the input. - in_deriv->CopyFromMat(out_deriv); - in_deriv->MulColsVec(scales_); - } - if (to_update != NULL) { // Next update the model (must do this 2nd so the derivatives we propagate // are accurate, in case this == to_update_in.) @@ -2117,6 +1962,13 @@ void PerElementScaleComponent::Backprop( else // the call below is to a virtual function that may be re-implemented to_update->Update(debug_info, in_value, out_deriv); // by child classes. } + + if (in_deriv) { + // Propagate the derivative back to the input. 
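+    // (The Data() comparison below guards the in-place case: when the
+    // optimizer has arranged for in_deriv and out_deriv to share storage,
+    // CopyFromMat() would be a self-copy, so it is skipped and only the
+    // MulColsVec() scaling is applied.)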
+ if (in_deriv->Data() != out_deriv.Data()) + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scales_); + } } void PerElementScaleComponent::Read(std::istream &is, bool binary) { @@ -2303,7 +2155,7 @@ void PerElementOffsetComponent::Backprop( // this scenario) CuMatrix out_deriv_copy(out_deriv_reshaped); BaseFloat scale = 1.0; - to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, NULL, + to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); @@ -2588,7 +2440,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; CuMatrix out_deriv_copy(out_deriv); to_update->offset_preconditioner_.PreconditionDirections( - &out_deriv_copy, NULL, &scale); + &out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } @@ -2611,7 +2463,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { to_update->scale_preconditioner_.PreconditionDirections( - &in_value_reconstructed, NULL, &scale); + &in_value_reconstructed, &scale); } to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_, in_value_reconstructed); @@ -2677,7 +2529,7 @@ void ConstantFunctionComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { @@ -2820,16 +2672,35 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); + + BaseFloat num_samples_history, alpha; + int32 rank_in, rank_out, update_period; + ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_in_); + ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_out_); + ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &update_period_); + ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_samples_history_); + ReadBasicType(is, binary, &num_samples_history); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &alpha_); + ReadBasicType(is, binary, &alpha); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + if (PeekToken(is, binary) == 'M') { // MaxChangePerSample, long ago removed; back compatibility. 
ExpectToken(is, binary, ""); @@ -2858,7 +2729,6 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; - SetNaturalGradientConfigs(); } @@ -2868,30 +2738,21 @@ NaturalGradientAffineComponent::NaturalGradientAffineComponent( AffineComponent(linear_params, bias_params, 0.001) { KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() && bias_params.Dim() != 0); - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; - SetNaturalGradientConfigs(); + + // set some default natural gradient configs. + preconditioner_in_.SetRank(20); + preconditioner_out_.SetRank(80); + preconditioner_in_.SetUpdatePeriod(4); + preconditioner_out_.SetUpdatePeriod(4); } void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; + is_gradient_ = false; // not configurable; there's no reason you'd want this InitLearningRatesFromConfig(cfl); - cfl->GetValue("num-samples-history", &num_samples_history_); - cfl->GetValue("alpha", &alpha_); - cfl->GetValue("rank-in", &rank_in_); - cfl->GetValue("rank-out", &rank_out_); - cfl->GetValue("update-period", &update_period_); if (cfl->GetValue("matrix", &matrix_filename)) { CuMatrix mat; @@ -2930,23 +2791,34 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); } + + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + + // Set natural-gradient configs. + BaseFloat num_samples_history = 2000.0, + alpha = 4.0; + int32 rank_in = 20, rank_out = 80, + update_period = 4; + cfl->GetValue("num-samples-history", &num_samples_history); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("update-period", &update_period); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - SetNaturalGradientConfigs(); -} - -void NaturalGradientAffineComponent::SetNaturalGradientConfigs() { - preconditioner_in_.SetRank(rank_in_); - preconditioner_in_.SetNumSamplesHistory(num_samples_history_); - preconditioner_in_.SetAlpha(alpha_); - preconditioner_in_.SetUpdatePeriod(update_period_); - preconditioner_out_.SetRank(rank_out_); - preconditioner_out_.SetNumSamplesHistory(num_samples_history_); - preconditioner_out_.SetAlpha(alpha_); - preconditioner_out_.SetUpdatePeriod(update_period_); } void NaturalGradientAffineComponent::Write(std::ostream &os, @@ -2957,28 +2829,30 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_in_); + WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_out_); + 
WriteBasicType(os, binary, preconditioner_out_.GetRank()); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); - WriteBasicType(os, binary, update_period_); + WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, num_samples_history_); + WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, alpha_); + WriteBasicType(os, binary, preconditioner_in_.GetAlpha()); WriteToken(os, binary, ""); } std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; - stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); - PrintParameterStats(stream, "bias", bias_params_, true); - stream << ", rank-in=" << rank_in_ - << ", rank-out=" << rank_out_ - << ", num-samples-history=" << num_samples_history_ - << ", update-period=" << update_period_ - << ", alpha=" << alpha_; + stream << AffineComponent::Info(); + stream << ", rank-in=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() + << ", update-period=" << preconditioner_in_.GetUpdatePeriod() + << ", alpha=" << preconditioner_in_.GetAlpha(); return stream.str(); } @@ -2989,15 +2863,8 @@ Component* NaturalGradientAffineComponent::Copy() const { NaturalGradientAffineComponent::NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other): AffineComponent(other), - rank_in_(other.rank_in_), - rank_out_(other.rank_out_), - update_period_(other.update_period_), - num_samples_history_(other.num_samples_history_), - alpha_(other.alpha_), preconditioner_in_(other.preconditioner_in_), - preconditioner_out_(other.preconditioner_out_) { - SetNaturalGradientConfigs(); -} + preconditioner_out_(other.preconditioner_out_) { } void NaturalGradientAffineComponent::Update( const std::string &debug_info, @@ -3020,8 +2887,8 @@ void NaturalGradientAffineComponent::Update( // than having the matrices scaled inside the preconditioning code). 
BaseFloat in_scale, out_scale; - preconditioner_in_.PreconditionDirections(&in_value_temp, NULL, &in_scale); - preconditioner_out_.PreconditionDirections(&out_deriv_temp, NULL, &out_scale); + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); // "scale" is a scaling factor coming from the PreconditionDirections calls // (it's faster to have them output a scaling factor than to have them scale @@ -3075,6 +2942,12 @@ void LinearComponent::Read(std::istream &is, bool binary) { KALDI_ASSERT(token == ""); ExpectToken(is, binary, ""); params_.Read(is, binary); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); @@ -3137,11 +3010,14 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { BaseFloat alpha = 4.0, num_samples_history = 2000.0; + use_natural_gradient_ = true; + cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); + cfl->GetValue("use-natural-gradient", &use_natural_gradient_); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); @@ -3152,6 +3028,9 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -3163,6 +3042,10 @@ void LinearComponent::Write(std::ostream &os, WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); @@ -3186,11 +3069,17 @@ void LinearComponent::Write(std::ostream &os, std::string LinearComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "params", params_); + PrintParameterStats(stream, "params", params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() - << ", rank-out=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() @@ -3228,9 +3117,9 @@ void LinearComponent::Backprop(const std::string &debug_info, // than having the matrices scaled inside the preconditioning code). 
BaseFloat in_scale, out_scale; to_update->preconditioner_in_.PreconditionDirections(&in_value_temp, - NULL, &in_scale); + &in_scale); to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp, - NULL, &out_scale); + &out_scale); BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_; to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, @@ -3252,12 +3141,14 @@ LinearComponent::LinearComponent( const LinearComponent &other): UpdatableComponent(other), params_(other.params_), + orthonormal_constraint_(other.orthonormal_constraint_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_in_(other.preconditioner_in_), preconditioner_out_(other.preconditioner_out_) { } LinearComponent::LinearComponent(const CuMatrix ¶ms): params_(params), + orthonormal_constraint_(0.0), use_natural_gradient_(true) { // Set defaults for natural gradient. preconditioner_in_.SetRank(40); @@ -3549,6 +3440,13 @@ void SoftmaxComponent::Backprop(const std::string &debug_info, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { + + if (to_update_in) { + SoftmaxComponent *to_update = + dynamic_cast(to_update_in); + to_update->StoreBackpropStats(out_deriv); + } + if (in_deriv == NULL) return; /* @@ -3588,8 +3486,13 @@ void LogSoftmaxComponent::Backprop(const std::string &debug_info, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, - Component *, // to_update + Component *to_update_in, CuMatrixBase *in_deriv) const { + if (to_update_in) { + LogSoftmaxComponent *to_update = + dynamic_cast(to_update_in); + to_update->StoreBackpropStats(out_deriv); + } if (in_deriv == NULL) return; in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); @@ -3902,7 +3805,7 @@ void NaturalGradientPerElementScaleComponent::Update( // scales_.AddRowSumMat(learning_rate_, derivs_per_frame). BaseFloat scale; - preconditioner_.PreconditionDirections(&derivs_per_frame, NULL, &scale); + preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); CuVector delta_scales(scales_.Dim()); delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame); @@ -5775,7 +5678,7 @@ void LstmNonlinearityComponent::Backprop( BaseFloat scale = 1.0; if (!to_update->is_gradient_) { to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, NULL, &scale); + ¶ms_deriv, &scale); } to_update->params_.AddMat(to_update->learning_rate_ * scale, params_deriv); @@ -5883,489 +5786,6 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { } } - - -void BatchNormComponent::ComputeDerived() { - if (!test_mode_) { - offset_.Resize(0); - scale_.Resize(0); - return; - } - - if (count_ == 0.0) { - KALDI_WARN << "Test-mode is set but there is no data count. " - "Creating random counts. This only makes sense " - "in unit-tests (or compute_prob_*.0.log). If you see this " - "elsewhere, something is very wrong."; - count_ = 1.0; - stats_sum_.SetRandn(); - stats_sumsq_.SetRandn(); - stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); - } - - offset_.Resize(block_dim_); - scale_.Resize(block_dim_); - offset_.CopyFromVec(stats_sum_); - offset_.Scale(-1.0 / count_); - // now offset_ is -mean. - scale_.CopyFromVec(stats_sumsq_); - scale_.Scale(1.0 / count_); - scale_.AddVecVec(-1.0, offset_, offset_, 1.0); - // now scale_ is variance. - // Mathematically the ApplyFloor statement should be a no-op; this is in case - // of numerical roundoff. - scale_.ApplyFloor(0.0); - scale_.Add(epsilon_); - scale_.ApplyPow(-0.5); - // now scale_ = min(variance, epsilon)^{-0.5}. 
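-  // (Numeric check of the derived quantities, covering the two remaining
-  // statements below as well: with count_ = 2, stats_sum_ = [2.0],
-  // stats_sumsq_ = [4.0], epsilon_ = 1e-3 and target_rms_ = 1.0, we get
-  // mean = 1.0, var = 4.0/2 - 1.0^2 = 1.0, scale_ = (1.0 + 1e-3)^-0.5
-  // ~= 0.9995 and offset_ = -mean * scale_ ~= -0.9995, so test mode
-  // computes y = 0.9995 * x - 0.9995.)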
- // next, multiply by the target RMS (normally 1.0). - scale_.Scale(target_rms_); - offset_.MulElements(scale_); - // now offset_ is -(scale*mean). -} - -void BatchNormComponent::SetTestMode(bool test_mode) { - test_mode_ = test_mode; - ComputeDerived(); -} - -void BatchNormComponent::Check() const { - KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0.0 && target_rms_ > 0.0); -} - -BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), test_mode_(other.test_mode_), - count_(other.count_), stats_sum_(other.stats_sum_), - stats_sumsq_(other.stats_sumsq_) { - ComputeDerived(); - Check(); -} - - -std::string BatchNormComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ - << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", count=" << count_ - << ", test-mode=" << (test_mode_ ? "true" : "false"); - if (count_ > 0) { - Vector mean(stats_sum_), var(stats_sumsq_); - mean.Scale(1.0 / count_); - var.Scale(1.0 / count_); - // subtract mean^2 from var. - var.AddVecVec(-1.0, mean, mean, 1.0); - var.ApplyFloor(0.0); - var.ApplyPow(0.5); // make it the stddev. - stream << ", data-mean=" << SummarizeVector(mean) - << ", data-stddev=" << SummarizeVector(var); - } - return stream.str(); -} - -void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { - dim_ = -1; - block_dim_ = -1; - epsilon_ = 1.0e-03; - target_rms_ = 1.0; - test_mode_ = false; - bool ok = cfl->GetValue("dim", &dim_); - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("test-mode", &test_mode_); - if (!ok || dim_ <= 0) { - KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; - } - if (block_dim_ == -1) - block_dim_ = dim_; - if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0 && target_rms_ > 0)) - KALDI_ERR << "Invalid configuration in BatchNormComponent."; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - count_ = 0; - stats_sum_.Resize(block_dim_); - stats_sumsq_.Resize(block_dim_); - if (test_mode_) { - ComputeDerived(); - } -} - - - -/* - BATCH_NORM_MATH - - This comment describes the equations involved in batch normalization, and - derives the forward and back-propagation. - - This is all dimension-by-dimension, so we just imagine the inputs - are scalars x(i), for i=0 .. n-1. - - FORWARD PASS: - - Define xsum = sum_i x(i) - x2sum = sum_i x(i)^2 - mean = xsum / n - var = x2sum / n - (mean*mean) - scale = (var + epsilon)^{-0.5} - offset = -mean * scale - - y(i) = scale * x(i) + offset - - Most of the rest of this comment derives how to compute the derivatives. If - you just want the formulas, please skip to the string 'BACKWARD PASS' below. - - We'll use a notation where an apostrophe on something means (the derivative of - the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. - We are given y'(i). 
Propagating the derivatives backward: - offset' = sum_i y'(i) - scale' = (sum_i y'(i) * x(i)) - offset' * mean - var' = scale' * -0.5 * (var + epsilon)^{-1.5} - = -0.5 * scale' * scale^3 - mean' = -offset' * scale - 2 * mean * var' - xsum' = mean' / n - x2sum' = var' / n - - So the derivatives propagated back to the original data are: - x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' - - The above is quite complicated to compute, but we can use some invariances - to work out a simpler way to compute the derivatives. - - Firstly, note that x'(i) is of the form: - - x'(i) = y'(i) * scale + [affine function of x(i)]. - - [it's a 1-d affine function, i.e. offset and scale]. - This has the same functional form as: - - x'(i) = y'(i) * scale + [affine function of y(i)]. - - since y(i) is an affine function of x(i) with nonzero scale. - Because the output is invariant to shifts in the input, sum_i x'(i) - will be zero. This is sufficient to determine the bias - term in the affine function. [Note: the scale on y(i) doesn't - come into it because the y(i) sum to zero]. The offset - will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. - So let's write it as - - x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). - - and it will be convenient to define: - - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - - which is just y'(i) with mean subtraction, scaled according to - the scale used in the normalization. So write - - x'(i) = x_deriv_base(i) + alpha y(i). - - The question is, what is the scale alpha. We don't actually need to - do any differentiation to figure this out. First, assume there is - no "+ epsilon" in the variance; later we'll explain why this doesn't - matter. The key to working out alpha is that the output is invariant - to scaling of the input. Assume we scale around the input's mean, - since that makes the math simpler. We can express this by the - constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. This is - equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since - y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint - to determine alpha, Using the above expressionfor x(i), we can write - this constraint as: - \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. - Now, since we said we'd ignore the epsilon, the output has unit variance, - so we know that \sum_i y(i) y(i) = n. - So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine - the epsilon term (or variance-flooring) as having been implemented by - adding a couple extra rows to the matrix with suitable values, and zero - output-deriv for those rows. If you think about it carefully you'll see that - the formula above is valid even if there is an extra term - in the variance. Anyway the correctness of the derivative will get tested - throughly by the component unit-tests. - - So to recap, here is the backprop. - - BACKWARD PASS: - - We are given y'(i), scale, and y(i). - - We compute: - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - alpha = - \sum_i y(i) x_deriv_base(i) / n - x'(i) = x_deriv_base(i) + alpha y(i) - */ - - - -void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(SameDim(in, *out) && - (in.NumCols() == dim_ || in.NumCols() == block_dim_)); - if (in.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
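-    // (Example of the reshape used for this recursion: with dim_ = 40,
-    // block_dim_ = 10 and a [64 x 40] input, ratio = 4, so the data is
-    // viewed as a [256 x 10] matrix (new_rows = 64 * 4, new_cols = 40 / 4).
-    // This reinterpretation is only valid for contiguous storage, which is
-    // what the Stride() == NumCols() assertion below checks.)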
-
-
-void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes,
-                                    const CuMatrixBase<BaseFloat> &in,
-                                    CuMatrixBase<BaseFloat> *out) const {
-  KALDI_ASSERT(SameDim(in, *out) &&
-               (in.NumCols() == dim_ || in.NumCols() == block_dim_));
-  if (in.NumCols() != block_dim_) {
-    // if block_dim_ != dim_, we recurse; this helps keep the main code
-    // simple.
-    KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols());
-    int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(),
-        orig_cols = in.NumCols(), new_rows = orig_rows * ratio,
-        new_cols = orig_cols / ratio;
-    CuSubMatrix<BaseFloat> in_reshaped(in.Data(), new_rows, new_cols, new_cols),
-        out_reshaped(out->Data(), new_rows, new_cols, new_cols);
-    return Propagate(indexes, in_reshaped, &out_reshaped);
-  }
-
-  // From this point, we can assume that the num-cols of 'in' and 'out'
-  // equals block_dim_.
-
-  if (!test_mode_) {
-    // search in the comment above for FORWARD PASS to see what is being
-    // implemented here.
-    // if this takes too much time due to multiple different CUDA calls,
-    // we'll consider making a single kernel for some of it.
-    Memo *memo = new Memo;
-    int32 num_frames = in.NumRows(), dim = block_dim_;
-    memo->num_frames = num_frames;
-    memo->mean_uvar_scale.Resize(4, dim);
-    CuSubVector<BaseFloat> mean(memo->mean_uvar_scale, 0),
-        uvar(memo->mean_uvar_scale, 1),
-        scale(memo->mean_uvar_scale, 2);
-    mean.AddRowSumMat(1.0 / num_frames, in, 0.0);
-    uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0);
-    scale.CopyFromVec(uvar);
-    // by applying this scale at this point, we save a multiply later on.
-    BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_);
-    scale.AddVecVec(-var_scale, mean, mean, var_scale);
-    // at this point, 'scale' contains just the variance [divided by
-    // target-rms^2].
-    scale.ApplyFloor(0.0);
-    scale.Add(var_scale * epsilon_);
-    // Now 'scale' contains the variance floored to zero and then with epsilon
-    // added [both divided by target-rms^2].
-    scale.ApplyPow(-0.5);
-    // now 'scale' is the actual scale we'll use.
-
-    // the next command will do no work if out == in, for in-place propagation.
-    out->CopyFromMat(in);
-    out->AddVecToRows(-1.0, mean, 1.0);
-    out->MulColsVec(scale);
-    return static_cast<void*>(memo);
-  } else {
-    if (offset_.Dim() != block_dim_) {
-      if (count_ == 0)
-        KALDI_ERR << "Test mode set in BatchNormComponent, but no stats.";
-      else  // why was ComputeDerived() not called?
-        KALDI_ERR << "Code error in BatchNormComponent";
-    }
-    out->CopyFromMat(in);
-    out->MulColsVec(scale_);
-    out->AddVecToRows(1.0, offset_, 1.0);
-    return NULL;
-  }
-}
-
-void BatchNormComponent::Backprop(
-    const std::string &debug_info,
-    const ComponentPrecomputedIndexes *indexes,
-    const CuMatrixBase<BaseFloat> &in_value,  // unused
-    const CuMatrixBase<BaseFloat> &out_value,
-    const CuMatrixBase<BaseFloat> &out_deriv,
-    void *memo_in,
-    Component *to_update,  // unused
-    CuMatrixBase<BaseFloat> *in_deriv) const {
-
-  KALDI_ASSERT(SameDim(out_value, out_deriv) &&
-               SameDim(out_value, *in_deriv) &&
-               (out_value.NumCols() == dim_ ||
-                out_value.NumCols() == block_dim_));
-  if (out_value.NumCols() != block_dim_) {
-    // if block_dim_ != dim_, we recurse; this helps keep the main code
-    // simple.
-    KALDI_ASSERT(out_value.Stride() == out_value.NumCols() &&
-                 out_deriv.Stride() == out_deriv.NumCols() &&
-                 in_deriv->Stride() == in_deriv->NumCols());
-    int32 ratio = dim_ / block_dim_,
-        orig_rows = out_value.NumRows(),
-        orig_cols = out_value.NumCols(),
-        new_rows = orig_rows * ratio, new_cols = orig_cols / ratio;
-    CuSubMatrix<BaseFloat> out_value_reshaped(out_value.Data(), new_rows,
-                                              new_cols, new_cols),
-        out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols),
-        in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols);
-    // we'll never use in_value, so pass it in unchanged.
-    Backprop(debug_info, indexes, in_value,
-             out_value_reshaped, out_deriv_reshaped,
-             memo_in, to_update, &in_deriv_reshaped);
-    return;
-  }
-
-  Memo *memo = static_cast<Memo*>(memo_in);
-
-  if (!test_mode_) {
-    // search above for BACKWARD PASS for a comment describing the math.
-    KALDI_ASSERT(memo != NULL && "memo not passed into backprop");
-    int32 num_frames = memo->num_frames;
-    KALDI_ASSERT(out_value.NumRows() == num_frames);
-    CuSubVector<BaseFloat> temp(memo->mean_uvar_scale, 3),
-        scale(memo->mean_uvar_scale, 2);
-    temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0);
-    // the following does no work if in_deriv and out_deriv are the same
-    // matrix.
-    in_deriv->CopyFromMat(out_deriv);
-    in_deriv->AddVecToRows(1.0, temp);
-    in_deriv->MulColsVec(scale);
-    // at this point, 'in_deriv' contains:
-    // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale
-    temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_),
-                       out_value, kTrans, *in_deriv, kNoTrans, 0.0);
-    // now, 'temp' contains the quantity which we described
-    // in the math as:
-    //   alpha = - \sum_i y(i) x_deriv_base(i) / n.
-    // The factor 1 / (target_rms_ * target_rms_) comes from following
-    // this additional scaling factor through the math.  In the comment I said
-    // "we know that \sum_i y(i) y(i) = n".  Taking target-rms into account
-    // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2".
-    in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0);
-    // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i).
-
-  } else {
-    KALDI_ASSERT(offset_.Dim() == block_dim_);
-    // the next call does no work if they point to the same memory.
-    in_deriv->CopyFromMat(out_deriv);
-    in_deriv->MulColsVec(scale_);
-  }
-}
-
-void BatchNormComponent::StoreStats(
-    const CuMatrixBase<BaseFloat> &in_value,
-    const CuMatrixBase<BaseFloat> &out_value,
-    void *memo_in) {
-  // in test mode this component does not store stats, it doesn't provide the
-  // kStoresStats flag.
-  KALDI_ASSERT(!test_mode_);
-  KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_);
-  if (out_value.NumCols() != block_dim_) {
-    // if block_dim_ != dim_, we recurse; this helps keep the main code
-    // simple.
-    KALDI_ASSERT(out_value.Stride() == out_value.NumCols());
-    int32 ratio = dim_ / block_dim_,
-        orig_rows = out_value.NumRows(),
-        orig_cols = out_value.NumCols(),
-        new_rows = orig_rows * ratio, new_cols = orig_cols / ratio;
-    CuSubMatrix<BaseFloat> out_value_reshaped(out_value.Data(), new_rows,
-                                              new_cols, new_cols);
-    // we'll never use in_value, so just pass it in unchanged.
-    StoreStats(in_value, out_value_reshaped, memo_in);
-    return;
-  }
-
-  Memo *memo = static_cast<Memo*>(memo_in);
-  KALDI_ASSERT(out_value.NumRows() == memo->num_frames);
-
-  CuSubVector<BaseFloat> mean(memo->mean_uvar_scale, 0),
-      uvar(memo->mean_uvar_scale, 1);
-  KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0);
-  BaseFloat num_frames = memo->num_frames;
-  if (stats_sum_.Dim() != block_dim_) {
-    stats_sum_.Resize(block_dim_);
-    stats_sumsq_.Resize(block_dim_);
-    KALDI_ASSERT(count_ == 0);
-  }
-  count_ += num_frames;
-  stats_sum_.AddVec(num_frames, mean, 1.0);
-  stats_sumsq_.AddVec(num_frames, uvar, 1.0);
-}
-
-void BatchNormComponent::Read(std::istream &is, bool binary) {
-  ExpectOneOrTwoTokens(is, binary, "<BatchNormComponent>", "<Dim>");
-  ReadBasicType(is, binary, &dim_);
-  ExpectToken(is, binary, "<BlockDim>");
-  ReadBasicType(is, binary, &block_dim_);
-  ExpectToken(is, binary, "<Epsilon>");
-  ReadBasicType(is, binary, &epsilon_);
-  ExpectToken(is, binary, "<TargetRms>");
-  ReadBasicType(is, binary, &target_rms_);
-  ExpectToken(is, binary, "<TestMode>");
-  ReadBasicType(is, binary, &test_mode_);
-  ExpectToken(is, binary, "<Count>");
-  ReadBasicType(is, binary, &count_);
-  ExpectToken(is, binary, "<StatsMean>");
-  stats_sum_.Read(is, binary);
-  ExpectToken(is, binary, "<StatsVar>");
-  stats_sumsq_.Read(is, binary);
-  stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0);
-  stats_sum_.Scale(count_);
-  stats_sumsq_.Scale(count_);
-  ExpectToken(is, binary, "</BatchNormComponent>");
-  ComputeDerived();
-  Check();
-}
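Read() above reconstructs the in-memory sum and sum-of-squares from the mean and variance that Write() (just below) puts on disk. A quick standalone check of that round-trip for a single dimension (not part of the patch; plain C++, made-up values):

#include <cstdio>

int main() {
  // In-memory stats for one dimension: count, sum, sum-of-squares.
  double count = 4.0, sum = 8.0, sumsq = 20.0;
  // Write(): store the mean and the variance (uncentered variance minus
  // mean^2), both obtained by dividing by the count.
  double mean = sum / count;                 // 2.0
  double var = sumsq / count - mean * mean;  // 5.0 - 4.0 = 1.0
  // Read(): reconstruct, mirroring the AddVecVec + Scale calls above.
  double sumsq2 = (var + mean * mean) * count;  // back to 20.0
  double sum2 = mean * count;                   // back to 8.0
  std::printf("sum %.1f -> %.1f, sumsq %.1f -> %.1f\n",
              sum, sum2, sumsq, sumsq2);
  return 0;
}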
-void BatchNormComponent::Write(std::ostream &os, bool binary) const {
-  Check();
-  WriteToken(os, binary, "<BatchNormComponent>");
-  WriteToken(os, binary, "<Dim>");
-  WriteBasicType(os, binary, dim_);
-  WriteToken(os, binary, "<BlockDim>");
-  WriteBasicType(os, binary, block_dim_);
-  WriteToken(os, binary, "<Epsilon>");
-  WriteBasicType(os, binary, epsilon_);
-  WriteToken(os, binary, "<TargetRms>");
-  WriteBasicType(os, binary, target_rms_);
-  WriteToken(os, binary, "<TestMode>");
-  WriteBasicType(os, binary, test_mode_);
-  WriteToken(os, binary, "<Count>");
-  WriteBasicType(os, binary, count_);
-  CuVector<double> mean(stats_sum_), var(stats_sumsq_);
-  if (count_ != 0) {
-    mean.Scale(1.0 / count_);
-    var.Scale(1.0 / count_);
-    var.AddVecVec(-1.0, mean, mean, 1.0);
-  }
-  WriteToken(os, binary, "<StatsMean>");
-  mean.Write(os, binary);
-  WriteToken(os, binary, "<StatsVar>");
-  var.Write(os, binary);
-  WriteToken(os, binary, "</BatchNormComponent>");
-}
-
-void BatchNormComponent::Scale(BaseFloat scale) {
-  if (scale == 0) {
-    count_ = 0.0;
-    stats_sum_.SetZero();
-    stats_sumsq_.SetZero();
-  } else {
-    count_ *= scale;
-    stats_sum_.Scale(scale);
-    stats_sumsq_.Scale(scale);
-  }
-}
-
-
-void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) {
-  const BatchNormComponent *other =
-      dynamic_cast<const BatchNormComponent*>(&other_in);
-  count_ += alpha * other->count_;
-  stats_sum_.AddVec(alpha, other->stats_sum_);
-  stats_sumsq_.AddVec(alpha, other->stats_sumsq_);
-  // this operation might change offset_ and scale_, so we recompute them
-  // in this instance (but not in Scale()).
-  ComputeDerived();
-}
-
-void BatchNormComponent::ZeroStats() {
-  // We only zero the stats if we're not in test mode.  In test mode, this
-  // would be dangerous as the stats are the source for the transform, and
-  // zeroing them and then calling ComputeDerived() again would remove the
-  // transform parameters (offset_ and scale_).
-  if (!test_mode_) {
-    count_ = 0.0;
-    stats_sum_.SetZero();
-    stats_sumsq_.SetZero();
-  }
-}
-
 SumBlockComponent::SumBlockComponent(const SumBlockComponent &other):
     input_dim_(other.input_dim_), output_dim_(other.output_dim_),
     scale_(other.scale_) { }
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index d7cece06284..b1eb30a55bf 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -40,6 +40,9 @@ namespace nnet3 {
 /// output for one input, and return the kSimpleComponent flag in their
 /// Properties(): for example, tanh and affine components.  In
 /// nnet-general-component.h there are components that don't fit this pattern.
+///
+/// Some components that do provide the kSimpleComponent flag are not declared
+/// here: see also nnet-normalize-component.h.
 
 // This "nnet3" version of the p-norm component only supports the 2-norm.
 class PnormComponent: public Component {
@@ -186,82 +189,6 @@ class ElementwiseProductComponent: public Component {
   int32 output_dim_;
 };
 
-/*
-   Implements the function:
-
-      y = x * (sqrt(dim(x)) * target-rms) / |x|
-
-   where |x| is the 2-norm of the vector x.  I.e. its output is its input
-   scaled such that the root-mean-square value of its elements equals
-   target-rms.  (As a special case, if the input is zero, it outputs zero).
-
-   Note: if you specify add-log-stddev=true, it adds an extra element to
-   y which equals log(|x| / sqrt(dim(x))).
-
-
-   Configuration values accepted:
-      dim, or input-dim   Input dimension of this component, e.g. 1024.
-                          Will be the same as the output dimension if
-                          add-log-stddev=false.
-      block-dim           Defaults to 'dim', but you may specify a nonzero
-                          divisor of 'dim'.  In this case the input dimension
-                          will be interpreted as blocks of dimension
-                          'block-dim' to which the nonlinearity described
-                          above is applied separately.
-      add-log-stddev      You can set this to true to add an extra output
-                          dimension which will equal log(|x| / sqrt(dim(x))).
-                          If block-dim is specified, this is done per block.
-      target-rms          This defaults to 1.0, but if you set it to another
-                          (nonzero) value, the output will be scaled by this
-                          factor.
- */
-class NormalizeComponent: public Component {
- public:
-  explicit NormalizeComponent(const NormalizeComponent &other);
-
-  virtual int32 Properties() const {
-    return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds|
-        (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) |
-        (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0);
-  }
-  NormalizeComponent() { }
-  virtual std::string Type() const { return "NormalizeComponent"; }
-  virtual void InitFromConfig(ConfigLine *cfl);
-  virtual Component* Copy() const { return new NormalizeComponent(*this); }
-  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
-                          const CuMatrixBase<BaseFloat> &in,
-                          CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &,  // out_value
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        void *memo,
-                        Component *to_update,
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual int32 InputDim() const { return input_dim_; }
-  virtual int32 OutputDim() const {
-    return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0));
-  }
-  virtual std::string Info() const;
- private:
-  NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow.
-  enum { kExpSquaredNormFloor = -66 };
-  // kSquaredNormFloor is about 0.7e-20.  We need a value that's exactly
-  // representable in float and whose inverse square root is also exactly
-  // representable in float (hence, an even power of two).
-  static const BaseFloat kSquaredNormFloor;
-  int32 input_dim_;
-  int32 block_dim_;
-  BaseFloat target_rms_;  // The target rms for outputs, default 1.0.
-
-  bool add_log_stddev_;  // If true, log(max(epsi, sqrt(row_in^T row_in / D)))
-                         // is an extra dimension of the output.
-};
-
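To make the function being removed above concrete, here is a standalone sketch (not part of the patch; plain C++, made-up values) of the NormalizeComponent mapping for a 4-dimensional input with target-rms = 1.0, including the optional add-log-stddev output:

#include <cmath>
#include <cstdio>

int main() {
  const int dim = 4;
  const double target_rms = 1.0;
  double x[dim] = { 3.0, -4.0, 0.0, 0.0 };  // |x| = 5
  double norm = 0.0;
  for (int i = 0; i < dim; i++) norm += x[i] * x[i];
  norm = std::sqrt(norm);
  // y = x * (sqrt(dim(x)) * target-rms) / |x|; the factor is 2/5 here,
  // which makes the RMS of the output elements equal to target-rms.
  double factor = std::sqrt((double)dim) * target_rms / norm;
  for (int i = 0; i < dim; i++)
    std::printf("y(%d) = %.3f\n", i, x[i] * factor);
  // With add-log-stddev=true, one extra output equals log(|x| / sqrt(dim)):
  std::printf("log-stddev output = %.3f\n",
              std::log(norm / std::sqrt((double)dim)));
  return 0;
}

For these values the output is (1.2, -1.6, 0, 0), whose root-mean-square is exactly 1.0, and the extra output is log(5/2), roughly 0.916.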
 
 /*
    Implements the sigmoid nonlinearity, i.e. the function y = 1 / (1 + exp(-x)).
@@ -463,10 +390,11 @@ class AffineComponent: public UpdatableComponent {
 
   virtual int32 InputDim() const { return linear_params_.NumCols(); }
   virtual int32 OutputDim() const { return linear_params_.NumRows(); }
+  BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; }
 
   virtual std::string Info() const;
   virtual void InitFromConfig(ConfigLine *cfl);
-  AffineComponent() { }  // use Init to really initialize.
+  AffineComponent(): orthonormal_constraint_(0.0) { }  // use Init to really initialize.
   virtual std::string Type() const { return "AffineComponent"; }
   virtual int32 Properties() const {
     return kSimpleComponent|kUpdatableComponent|
@@ -507,6 +435,7 @@ class AffineComponent: public UpdatableComponent {
                  const CuMatrixBase<BaseFloat> &linear);
   const CuVector<BaseFloat> &BiasParams() const { return bias_params_; }
   const CuMatrix<BaseFloat> &LinearParams() const { return linear_params_; }
+  CuMatrix<BaseFloat> &LinearParams() { return linear_params_; }
   explicit AffineComponent(const AffineComponent &other);
   // The next constructor is used in converting from nnet1.
   AffineComponent(const CuMatrixBase<BaseFloat> &linear_params,
@@ -539,6 +468,7 @@ class AffineComponent: public UpdatableComponent {
   const AffineComponent &operator = (const AffineComponent &other); // Disallow.
   CuMatrix<BaseFloat> linear_params_;
   CuVector<BaseFloat> bias_params_;
+  BaseFloat orthonormal_constraint_;
 };
 
 class RepeatedAffineComponent;
@@ -828,6 +758,19 @@ class LogSoftmaxComponent: public NonlinearComponent {
       Dimension is output-dim by (input-dim + 1), last
       column is interpreted as the bias.
 
+   Other options:
+      orthonormal-constraint=0.0   If you set this to 1.0, then
+           the linear_params_ matrix will be (approximately)
+           constrained during training to have orthonormal rows
+           (or columns, whichever is fewer).  You can choose a
+           positive nonzero value different than 1.0 to have a
+           scaled orthonormal matrix, i.e. with singular values
+           at the selected value (e.g. 0.5, or 2.0).  This is
+           not enforced inside the component itself; you have to
+           call ConstrainOrthonormal() from the training code to
+           do this.  All this component does is return the
+           OrthonormalConstraint() value.
+
    Options to the natural gradient (you won't normally have to set these,
    the defaults are suitable):
 
@@ -871,22 +814,10 @@ class NaturalGradientAffineComponent: public AffineComponent {
   NaturalGradientAffineComponent &operator= (
       const NaturalGradientAffineComponent&);
 
-  // Configs for preconditioner.  The input side tends to be better
-  // conditioned -> smaller rank needed, so make them separately configurable.
-  int32 rank_in_;
-  int32 rank_out_;
-  int32 update_period_;
-  BaseFloat num_samples_history_;
-  BaseFloat alpha_;
-
   OnlineNaturalGradient preconditioner_in_;
   OnlineNaturalGradient preconditioner_out_;
 
-  // Sets the configs rank, alpha and eta in the preconditioner objects,
-  // from the class variables.
-  void SetNaturalGradientConfigs();
-
   virtual void Update(
       const std::string &debug_info,
       const CuMatrixBase<BaseFloat> &in_value,
@@ -919,6 +850,16 @@ class NaturalGradientAffineComponent: public AffineComponent {
       bias-stddev, bias-mean) to initialize the parameters.
       Dimension is output-dim by (input-dim + 1), last
       column is interpreted as the bias.
+      orthonormal-constraint=0.0   If you set this to 1.0, then
+          this matrix will be (approximately) constrained during
+          training to have orthonormal rows (or columns, whichever
+          is fewer).  You can choose a positive nonzero value different
+          than 1.0 to have a scaled orthonormal matrix, i.e. with singular
+          values at the selected value (e.g. 0.5, or 2.0).
+          This is not enforced inside the component
+          itself; you have to call ConstrainOrthonormal()
+          from the training code to do this.  All this component
+          does is return the OrthonormalConstraint() value.
 
    Options to the natural gradient (you won't normally have to set these,
    the defaults are suitable):
@@ -982,14 +923,19 @@ class LinearComponent: public UpdatableComponent {
 
   explicit LinearComponent(const LinearComponent &other);
 
   explicit LinearComponent(const CuMatrix<BaseFloat> &params);
+
+  BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; }
+  CuMatrixBase<BaseFloat> &Params() { return params_; }
+  const CuMatrixBase<BaseFloat> &Params() const { return params_; }
  private:
 
   // disallow assignment operator.
   LinearComponent &operator= (
       const LinearComponent&);
 
-  CuMatrix<BaseFloat> params_;
+
+  BaseFloat orthonormal_constraint_;
   // If true (and if no this->is_gradient_), use natural gradient updates.
   bool use_natural_gradient_;
   OnlineNaturalGradient preconditioner_in_;
@@ -1460,8 +1406,12 @@ class PermuteComponent: public Component {
 
 
 
-// PerElementScaleComponent scales each dimension of its input with a separate
-// trainable scale; it's like a linear component with a diagonal matrix.
+/**
+   PerElementScaleComponent scales each dimension of its input with a separate
+   trainable scale; it's like a linear component with a diagonal matrix.  This
+   version (and its child class NaturalGradientPerElementScaleComponent)
+   requires the input for backprop.  See also ScaleAndOffsetComponent.
+*/
 class PerElementScaleComponent: public UpdatableComponent {
  public:
   virtual int32 InputDim() const { return scales_.Dim(); }
@@ -1474,7 +1424,7 @@ class PerElementScaleComponent: public UpdatableComponent {
   virtual std::string Type() const { return "PerElementScaleComponent"; }
   virtual int32 Properties() const {
     return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput|
-        kPropagateInPlace;
+        kPropagateInPlace|kBackpropInPlace;
   }
 
   virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
@@ -1686,8 +1636,7 @@ class ConstantFunctionComponent: public UpdatableComponent {
 
 // NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
-// it uses a natural gradient update for the per-element scales, and enforces a
-// maximum amount of change per minibatch, for stability.
+// it uses a natural gradient update for the per-element scales.
 class NaturalGradientPerElementScaleComponent:
     public PerElementScaleComponent {
  public:
@@ -2384,183 +2333,6 @@ class MaxpoolingComponent: public Component {
 };
 
 
-/*
-  BatchNormComponent
-
-  This implements batch normalization; for each dimension of the
-  input it normalizes the data to be zero-mean, unit-variance.  You
-  can set the block-dim configuration value to implement spatial
-  batch normalization; see the comment for the variable.
-
-  If you want to combine this with the trainable offset and scale that the
-  original BatchNorm paper used, then follow this by the
-  ScaleAndOffsetComponent.
-
-  It's a simple component (uses the kSimpleComponent flag), but it is unusual
-  in that it will give different results if you call it on half the matrix at
-  a time.  Most of the time this would be pretty harmless, so we still return
-  the kSimpleComponent flag.  We may have to modify the test code a little to
-  account for this, or possibly remove the kSimpleComponent flag.  In some
-  sense each output Index depends on every input Index, but putting those
-  dependencies explicitly into the dependency-tracking framework as a
-  GeneralComponent would be very impractical and might lead to a lot of
-  unnecessary things being computed.  You have to be a bit careful where you
-  put this component, and understand what you're doing, e.g. putting it in the
-  path of a recurrence is a bit problematic if the minibatch size is small.
-
-  Accepted configuration values:
-     dim          Dimension of the input and output.
-     block-dim    Defaults to 'dim', but may be set to a nonzero divisor
-                  of 'dim'.  In this case, each block of dimension 'block-dim'
-                  is treated like a separate row of the input matrix, which
-                  means that the stats from the n'th element of each
-                  block are pooled into one class, for each n.
-     epsilon      Small term added to the variance that is used to prevent
-                  division by zero.
-     target-rms   This defaults to 1.0, but if set, for instance, to 2.0,
-                  it will normalize the standard deviation of the output to
-                  2.0.  'target-stddev' might be a more suitable name, but
-                  this was chosen for consistency with NormalizeComponent.
- */
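To illustrate the block-dim pooling described above, the following standalone sketch (not part of the patch; plain C++, made-up values) shows how a 2x6 input with block-dim = 3 is viewed as a 4x3 matrix, so that the n'th column of every block lands in the same normalization class:

#include <cstdio>

int main() {
  const int rows = 2, dim = 6, block_dim = 3;
  const int ratio = dim / block_dim;
  double in[rows][dim] = { { 0, 1, 2, 3, 4, 5 },
                           { 6, 7, 8, 9, 10, 11 } };
  // Reinterpret the contiguous data as a (rows * ratio) x block_dim matrix,
  // exactly like the CuSubMatrix built from in.Data() in Propagate().
  // After the reshape, reshaped column 0 holds elements {0, 3, 6, 9}, i.e.
  // original columns 0 and 3 are pooled; likewise {1, 4} and {2, 5}.
  const double *data = &in[0][0];
  for (int r = 0; r < rows * ratio; r++) {
    for (int c = 0; c < block_dim; c++)
      std::printf("%5.1f", data[r * block_dim + c]);
    std::printf("\n");
  }
  return 0;
}

This is why the component demands contiguous (stride == num-cols) input and output whenever block-dim is less than dim.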
-class BatchNormComponent: public Component {
- public:
-
-  BatchNormComponent() { }
-
-  // call this with 'true' to set 'test mode' where the batch normalization is
-  // done with stored stats.  There won't normally be any need to specially
-  // accumulate these stats; they are stored as a matter of course on each
-  // iteration of training, as for NonlinearComponents, and we'll use the stats
-  // from the most recent [script-level] iteration.
-  void SetTestMode(bool test_mode);
-
-  // constructor using another component
-  BatchNormComponent(const BatchNormComponent &other);
-
-  virtual int32 InputDim() const { return dim_; }
-  virtual int32 OutputDim() const { return dim_; }
-
-  virtual std::string Info() const;
-  virtual void InitFromConfig(ConfigLine *cfl);
-  virtual std::string Type() const { return "BatchNormComponent"; }
-  virtual int32 Properties() const {
-    // If the block-dim is less than the dim, we need the input and output
-    // matrices to be contiguous (stride==num-cols), as we'll be reshaping
-    // internally.  This is not much of a cost, because this will be used
-    // in convnets where we have to do this anyway.
-    return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace|
-        kBackpropInPlace|
-        (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)|
-        (test_mode_ ? 0 : kUsesMemo|kStoresStats);
-  }
-  virtual void* Propagate(const ComponentPrecomputedIndexes *indexes,
-                          const CuMatrixBase<BaseFloat> &in,
-                          CuMatrixBase<BaseFloat> *out) const;
-  virtual void Backprop(const std::string &debug_info,
-                        const ComponentPrecomputedIndexes *indexes,
-                        const CuMatrixBase<BaseFloat> &in_value,
-                        const CuMatrixBase<BaseFloat> &out_value,
-                        const CuMatrixBase<BaseFloat> &out_deriv,
-                        void *memo,
-                        Component *,  // to_update
-                        CuMatrixBase<BaseFloat> *in_deriv) const;
-
-  virtual void Read(std::istream &is, bool binary);  // This Read function
-  // requires that the Component has the correct type.
-
-  /// Write component to stream
-  virtual void Write(std::ostream &os, bool binary) const;
-  virtual Component* Copy() const { return new BatchNormComponent(*this); }
-
-  virtual void Scale(BaseFloat scale);
-  virtual void Add(BaseFloat alpha, const Component &other);
-  virtual void ZeroStats();
-
-
-  virtual void DeleteMemo(void *memo) const { delete static_cast<Memo*>(memo); }
-
-  virtual void StoreStats(const CuMatrixBase<BaseFloat> &in_value,
-                          const CuMatrixBase<BaseFloat> &out_value,
-                          void *memo);
-
-  // Members specific to this component type.
-  // Note: the offset and scale will only be nonempty in 'test mode'.
-  const CuVector<BaseFloat> &Offset() const { return offset_; }
-  const CuVector<BaseFloat> &Scale() const { return scale_; }
-
- private:
-
-  struct Memo {
-    // number of frames (after any reshaping).
-    int32 num_frames;
-    // 'mean_uvar_scale' is of dimension 4 by block_dim_:
-    // Row 0 = mean = the mean of the rows of the input.
-    // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames).
-    // Row 2 = scale = the scale of the renormalization, which is the inverse
-    //         stddev of the input (modified by epsilon_; see the Propagate
-    //         function).
-    // Row 3 is used as a temporary in Backprop.
-    CuMatrix<BaseFloat> mean_uvar_scale;
-  };
-
-  void Check() const;
-
-  // this function is used in a couple of places; it turns the raw stats into
-  // the offset/scale term of a normalizing transform.
-  static void ComputeOffsetAndScale(double count,
-                                    BaseFloat epsilon,
-                                    const Vector<double> &stats_sum,
-                                    const Vector<double> &stats_sumsq,
-                                    Vector<BaseFloat> *offset,
-                                    Vector<BaseFloat> *scale);
-  // computes derived parameters offset_ and scale_.
-  void ComputeDerived();
-
-  // Dimension of the input and output.
-  int32 dim_;
-  // This would normally be the same as dim_, but if it's less (and it must
-  // be > 0 and must divide dim_), then each separate block of the input of
-  // dimension 'block_dim_' is treated like a separate frame for the purposes
-  // of normalization.  This can be used to implement spatial batch
-  // normalization for convolutional setups-- assuming the filter-dim has
-  // stride 1, which it always will in the new code in
-  // nnet-convolutional-component.h.
-  int32 block_dim_;
-
-  // Used to avoid exact-zero variances; epsilon has the dimension of a
-  // covariance.
-  BaseFloat epsilon_;
-
-  // This value will normally be 1.0, which is the default, but you can set it
-  // to other values as a way to control how fast the following layer learns
-  // (smaller -> slower).  The same config exists in NormalizeComponent.
-  BaseFloat target_rms_;
-
-  // This is true if we want the batch normalization to operate in 'test mode'
-  // meaning the data mean and stddev used for the normalization are fixed
-  // quantities based on previously accumulated stats.  Note: the stats we use
-  // for this are based on the same 'StoreStats' mechanism as we use for
-  // components like SigmoidComponent and ReluComponent; we'll be using
-  // the stats from the most recent [script-level] iteration of training.
-  bool test_mode_;
-
-
-  // total count of stats stored by StoreStats().
-  double count_;
-  // sum-of-data component of stats of input data.
-  CuVector<double> stats_sum_;
-  // sum-of-squared component of stats of input data.
-  CuVector<double> stats_sumsq_;
-
-  // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they
-  // dictate the transform that is done in 'test mode'.  They are set only when
-  // reading the model from disk and when calling SetTestMode(true); they are
-  // resized to empty when the stats are updated, to ensure that out-of-date
-  // values are not kept around.
-  CuVector<BaseFloat> offset_;
-  CuVector<BaseFloat> scale_;
-};
-
-
 /**
    CompositeComponent is a component representing a sequence of
    [simple] components.  The config line would be something like the following
diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc
index 83b902a9b90..48a97df9ea1 100644
--- a/src/nnet3/nnet-test-utils.cc
+++ b/src/nnet3/nnet-test-utils.cc
@@ -1675,11 +1675,11 @@ static void GenerateRandomComponentConfig(std::string *component_type,
     // labels to the most recently added component, so it gets tested more
     case 31: {
       *component_type = "BatchNormComponent";
-      int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2);
+      int32 block_dim = RandInt(1, 20), dim = block_dim * RandInt(1, 2);
       bool test_mode = (RandInt(0, 1) == 0);
       os << " dim=" << dim << " block-dim=" << block_dim << " target-rms="
-         << RandInt(1, 2) << " test-mode="
+         << RandInt(1, 4) << " test-mode="
          << (test_mode ? "true" : "false")
          << " epsilon=" << (RandInt(0, 1) == 0 ? "0.1" : "1.0");
       break;
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index a9093523222..6bff30c501b 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -88,8 +88,11 @@ void NnetTrainer::Train(const NnetExample &eg) {
 
 void NnetTrainer::TrainInternal(const NnetExample &eg,
                                 const NnetComputation &computation) {
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats.  This is mainly important for batch-norm.
   NnetComputer computer(config_.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.io);
   computer.Run();
@@ -112,6 +115,10 @@ void NnetTrainer::TrainInternal(const NnetExample &eg,
   // happens when we use the model with batchnorm test-mode set).
   ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_);
 
+  // The following will only do something if we have a LinearComponent
+  // or AffineComponent with orthonormal-constraint set to a nonzero value.
+  ConstrainOrthonormal(nnet_);
+
   // Scale delta_nnet
   if (success)
     ScaleNnet(config_.momentum, delta_nnet_);
@@ -122,8 +129,11 @@ void NnetTrainer::TrainInternal(const NnetExample &eg,
 void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg,
                                           const NnetComputation &computation,
                                           bool is_backstitch_step1) {
+  // note: because we give the 1st arg (nnet_) as a pointer to the
+  // constructor of 'computer', it will use that copy of the nnet to
+  // store stats.  This is mainly important for batch-norm.
   NnetComputer computer(config_.compute_config, computation,
-                        *nnet_, delta_nnet_);
+                        nnet_, delta_nnet_);
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, eg.io);
   computer.Run();
@@ -159,6 +169,21 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg,
       max_change_scale, scale_adding, nnet_,
       &num_max_change_per_component_applied_, &num_max_change_global_applied_);
 
+  if (is_backstitch_step1) {
+    // The following will only do something if we have a LinearComponent or
+    // AffineComponent with orthonormal-constraint set to a nonzero value.  We
+    // choose to do this only on the 1st backstitch step, for efficiency.
+    ConstrainOrthonormal(nnet_);
+  }
+
+  if (!is_backstitch_step1) {
+    // Scale down the batchnorm stats (keeps them fresh... this affects what
+    // happens when we use the model with batchnorm test-mode set).  Do this
+    // after backstitch step 2 so that the stats are scaled down before we
+    // start the next minibatch.
+    ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_);
+  }
+
   ScaleNnet(0.0, delta_nnet_);
 }
 
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 64fc3003609..fd2229cace8 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -22,6 +22,7 @@
 #include "nnet3/nnet-utils.h"
 #include "nnet3/nnet-graph.h"
 #include "nnet3/nnet-simple-component.h"
+#include "nnet3/nnet-normalize-component.h"
 #include "nnet3/nnet-general-component.h"
 #include "nnet3/nnet-convolutional-component.h"
 #include "nnet3/nnet-parse.h"
@@ -491,9 +492,7 @@ void SetDropoutProportion(BaseFloat dropout_proportion,
 bool HasBatchnorm(const Nnet &nnet) {
   for (int32 c = 0; c < nnet.NumComponents(); c++) {
     const Component *comp = nnet.GetComponent(c);
-    const BatchNormComponent *bc =
-        dynamic_cast<const BatchNormComponent*>(comp);
-    if (bc != NULL)
+    if (dynamic_cast<const BatchNormComponent*>(comp) != NULL)
       return true;
   }
   return false;
@@ -859,6 +858,105 @@ class SvdApplier {
   std::string component_name_pattern_;
 };
 
+// Does an update that moves M closer to being a (matrix with
+// orthonormal rows) times 'scale'.  Note: this will diverge if
+// we start off with singular values too far from 'scale'.
+void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase<BaseFloat> *M) {
+  // A larger alpha will update faster but will be more prone to instability.
+  // I believe the scalar value below shouldn't be more than 0.25, or maybe
+  // 0.5, or it will always be unstable.  It should be > 0.0.
+
+  // The factor of 1/scale^2 is, I *believe*, going to give us the right kind
+  // of invariance w.r.t. the scale.  To explain why this is the appropriate
+  // factor, look at the statement
+  //   M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0);
+  // where P is proportional to scale^2, M to 'scale' and alpha to 1/scale^2,
+  // so the change in M_update is proportional to 'scale'.  We'd like
+  // 'M_update' to be proportional to 'scale'.  This reasoning is very
+  // hand-wavey but I think it can be made rigorous.  This is about remaining
+  // stable (not prone to divergence) even for very large or small values of
+  // 'scale'.
+  BaseFloat alpha = 0.125 / (scale * scale);
+
+  // We'd like to enforce the rows of M to be orthonormal.
+  // Define P = M M^T.  If P is unit then M has orthonormal rows.
+  // We actually want P to equal scale^2 * I, so that M's rows are
+  // orthogonal with 2-norms equal to 'scale'.
+  // We (notionally) add to the objective function, the value
+  // -alpha times the sum of squared elements of Q = (P - scale^2 * I).
+  int32 rows = M->NumRows(), cols = M->NumCols();
+  CuMatrix<BaseFloat> M_update(rows, cols);
+  CuMatrix<BaseFloat> P(rows, rows);
+  P.SymAddMat2(1.0, *M, kNoTrans, 0.0);
+  P.CopyLowerToUpper();
+  P.AddToDiag(-1.0 * scale * scale);
+
+  if (GetVerboseLevel() >= 1) {
+    BaseFloat error = P.FrobeniusNorm();
+    KALDI_VLOG(2) << "Error in orthogonality is " << error;
+  }
+
+  // At this point, the matrix P contains what, in the math, would be
+  // Q = P - scale^2*I.  The derivative of the objective function w.r.t. an
+  // element q(i,j) of Q is now equal to -2*alpha*q(i,j), i.e. we could write
+  // q_deriv(i,j) = -2*alpha*q(i,j).  This is also the derivative of the
+  // objective function w.r.t. p(i,j): i.e. p_deriv(i,j) = -2*alpha*q(i,j).
+  // Suppose we have defined this matrix as 'P_deriv'.
+  // The derivative of the objective w.r.t M equals
+  // 2 * P_deriv * M, which equals -4*alpha*(P-scale^2*I)*M.
+  // (Currently the matrix P contains what, in the math, is P-scale^2*I).
+  M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0);
+  M->AddMat(1.0, M_update);
+}
+
+/**
+   This function, to be called after processing every minibatch, is responsible
+   for enforcing the orthogonality constraint for any components of type
+   LinearComponent or inheriting from AffineComponent that have the
+   "orthonormal-constraint" value set.
+ */
+void ConstrainOrthonormal(Nnet *nnet) {
+
+  for (int32 c = 0; c < nnet->NumComponents(); c++) {
+    Component *component = nnet->GetComponent(c);
+    LinearComponent *lc = dynamic_cast<LinearComponent*>(component);
+    if (lc != NULL && lc->OrthonormalConstraint() != 0.0) {
+      if (RandInt(0, 3) != 0)
+        continue;  // For efficiency, only do this every 4 minibatches-- it
+                   // won't stray far.
+      BaseFloat scale = lc->OrthonormalConstraint();
+      KALDI_ASSERT(scale > 0.0);
+
+      CuMatrixBase<BaseFloat> &params = lc->Params();
+      int32 rows = params.NumRows(), cols = params.NumCols();
+      if (rows <= cols) {
+        ConstrainOrthonormalInternal(scale, &params);
+      } else {
+        CuMatrix<BaseFloat> params_trans(params, kTrans);
+        ConstrainOrthonormalInternal(scale, &params_trans);
+        params.CopyFromMat(params_trans, kTrans);
+      }
+    }
+
+    AffineComponent *ac = dynamic_cast<AffineComponent*>(component);
+    if (ac != NULL && ac->OrthonormalConstraint() != 0.0) {
+      if (RandInt(0, 3) != 0)
+        continue;  // For efficiency, only do this every 4 minibatches-- it
+                   // won't stray far.
+      BaseFloat scale = ac->OrthonormalConstraint();
+      KALDI_ASSERT(scale > 0.0);
+      CuMatrixBase<BaseFloat> &params = ac->LinearParams();
+      int32 rows = params.NumRows(), cols = params.NumCols();
+      if (rows <= cols) {
+        ConstrainOrthonormalInternal(scale, &params);
+      } else {
+        CuMatrix<BaseFloat> params_trans(params, kTrans);
+        ConstrainOrthonormalInternal(scale, &params_trans);
+        params.CopyFromMat(params_trans, kTrans);
+      }
+    }
+  }
+}
+
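The function above only nudges M toward orthonormality on each call; the following standalone sketch (not part of the patch; plain C++ on a made-up 2x3 matrix, scale = 1.0) applies the same update rule repeatedly and shows the orthogonality error shrinking toward zero:

#include <cmath>
#include <cstdio>

const int R = 2, C = 3;

// Frobenius norm of M M^T - I.
double OrthError(const double M[R][C]) {
  double err = 0.0;
  for (int i = 0; i < R; i++)
    for (int j = 0; j < R; j++) {
      double p = 0.0;
      for (int k = 0; k < C; k++) p += M[i][k] * M[j][k];
      if (i == j) p -= 1.0;
      err += p * p;
    }
  return std::sqrt(err);
}

int main() {
  double M[R][C] = { { 1.1, 0.2, -0.1 }, { 0.4, 0.9, 0.3 } };
  const double scale = 1.0, alpha = 0.125 / (scale * scale);
  for (int iter = 0; iter < 12; iter++) {
    std::printf("iter %2d: |M M^T - I| = %.6f\n", iter, OrthError(M));
    double Q[R][R];  // Q = M M^T - scale^2 * I
    for (int i = 0; i < R; i++)
      for (int j = 0; j < R; j++) {
        Q[i][j] = (i == j ? -scale * scale : 0.0);
        for (int k = 0; k < C; k++) Q[i][j] += M[i][k] * M[j][k];
      }
    // M <-- M - 4 * alpha * Q * M, as in the code above.
    double upd[R][C] = {};
    for (int i = 0; i < R; i++)
      for (int k = 0; k < C; k++)
        for (int j = 0; j < R; j++)
          upd[i][k] += Q[i][j] * M[j][k];
    for (int i = 0; i < R; i++)
      for (int k = 0; k < C; k++)
        M[i][k] -= 4.0 * alpha * upd[i][k];
  }
  return 0;
}

In singular-value terms each update maps s to s - 0.5 * (s^2 - 1) * s, whose fixed point at s = 1 has zero derivative, so convergence near the fixed point is quadratic; this is why a few minibatches suffice even though each call only approximates the constraint.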
// This code has been broken out of ReadEditConfig as it's quite long.
// It implements the internals of the edit directive 'reduce-rank'.
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index d961b7cb6a0..efa36e1f64c 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -168,8 +168,7 @@ std::string NnetInfo(const Nnet &nnet);
 void SetDropoutProportion(BaseFloat dropout_proportion,
                           Nnet *nnet);
 
-/// Returns true if nnet has at least one component of type
-/// BatchNormComponent.
+/// Returns true if nnet has at least one component of type BatchNormComponent.
 bool HasBatchnorm(const Nnet &nnet);
 
 /// This function affects only components of type BatchNormComponent.
@@ -251,7 +250,6 @@ struct CollapseModelConfig {
 void CollapseModel(const CollapseModelConfig &config,
                    Nnet *nnet);
 
-
 /**
    ReadEditConfig() reads a file with a similar-looking format to the config file
    read by Nnet::ReadConfig(), but this consists of a sequence of operations to
@@ -452,6 +450,18 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale,
                          Nnet *nnet);
 
+/**
+   This function, to be called after processing every minibatch, is responsible
+   for enforcing the orthogonality constraint for any components of type
+   LinearComponent or inheriting from AffineComponent that have the
+   "orthonormal-constraint" value set to nonzero.
+
+   In order to make it efficient on GPU, it doesn't make the parameter matrix
+   completely orthonormal; it just makes it closer to being orthonormal (times
+   the 'orthonormal-constraint' value).  Over multiple iterations this rapidly
+   makes it almost exactly orthonormal.
+ */
+void ConstrainOrthonormal(Nnet *nnet);
+
 /**
    This utility function can be used to obtain the number of distinct 'n'
    values in a training example.  This is the number of examples (e.g.
   sequences) that have been combined into a single example.  (Actually
diff --git a/src/nnet3bin/nnet3-show-progress.cc b/src/nnet3bin/nnet3-show-progress.cc
index 7e937f0c211..25a65dbed5c 100644
--- a/src/nnet3bin/nnet3-show-progress.cc
+++ b/src/nnet3bin/nnet3-show-progress.cc
@@ -132,6 +132,10 @@ int main(int argc, char *argv[]) {
     { // Get info about magnitude of parameter change.
       Nnet diff_nnet(nnet1);
      AddNnet(nnet2, -1.0, &diff_nnet);
+      if (GetVerboseLevel() >= 1) {
+        KALDI_VLOG(1) << "Printing info for the difference between the neural nets: "
+                      << diff_nnet.Info();
+      }
       int32 num_updatable = NumUpdatableComponents(diff_nnet);
       Vector<BaseFloat> dot_prod(num_updatable);
       ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod);
@@ -139,12 +143,15 @@ int main(int argc, char *argv[]) {
       KALDI_LOG << "Parameter differences per layer are "
                 << PrintVectorPerUpdatableComponent(nnet1, dot_prod);
 
-      Vector<BaseFloat> baseline_prod(num_updatable);
+      Vector<BaseFloat> baseline_prod(num_updatable),
+          new_prod(num_updatable);
       ComponentDotProducts(nnet1, nnet1, &baseline_prod);
+      ComponentDotProducts(nnet2, nnet2, &new_prod);
       baseline_prod.ApplyPow(0.5);
+      new_prod.ApplyPow(0.5);
-      KALDI_LOG << "Norms of parameter matrices are "
-                << PrintVectorPerUpdatableComponent(nnet1, baseline_prod);
+      KALDI_LOG << "Norms of parameter matrices from <new-nnet> are "
+                << PrintVectorPerUpdatableComponent(nnet2, new_prod);
 
       dot_prod.DivElements(baseline_prod);
       KALDI_LOG << "Relative parameter differences per layer are "
diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc
index f490f490f61..0e45fe665b5 100644
--- a/src/rnnlm/rnnlm-embedding-training.cc
+++ b/src/rnnlm/rnnlm-embedding-training.cc
@@ -77,12 +77,11 @@ void RnnlmEmbeddingTrainer::Train(
     if (l2_term != 0.0) {
       embedding_deriv->AddMat(l2_term, *embedding_mat_);
     }
-   }
+  }
 
   BaseFloat scale = 1.0;
   if (config_.use_natural_gradient) {
-    preconditioner_.PreconditionDirections(embedding_deriv, NULL,
-                                           &scale);
+    preconditioner_.PreconditionDirections(embedding_deriv, &scale);
   }
   scale *= config_.learning_rate;
   num_minibatches_++;
@@ -130,11 +129,10 @@ void RnnlmEmbeddingTrainer::Train(
     if (l2_term != 0.0) {
      embedding_deriv->AddToRows(l2_term, active_words, embedding_mat_);
    }
-   }
+  }
 
   BaseFloat scale = 1.0;
   if (config_.use_natural_gradient) {
-    preconditioner_.PreconditionDirections(embedding_deriv, NULL,
-                                           &scale);
+    preconditioner_.PreconditionDirections(embedding_deriv, &scale);
   }
   scale *= config_.learning_rate;
   num_minibatches_++;