diff --git a/egs/cifar/v1/local/nnet3/compare.sh b/egs/cifar/v1/local/nnet3/compare.sh new file mode 100755 index 00000000000..c5208c38ac0 --- /dev/null +++ b/egs/cifar/v1/local/nnet3/compare.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# this script is used for comparing trained models between systems. +# e.g. local/nnet3/compare.sh exp/resnet1{b,c}_cifar10 + + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/resnet1{b,c}_cifar10" + exit 1 +fi + +echo "# $0 $*" + + + +echo -n "# System " +for x in $*; do printf "% 12s" " $(basename $x)"; done +echo + + +echo -n "# final test accuracy: " +for x in $*; do + acc=$(grep acc $x/log/compute_prob_valid.final.log | awk '{print $8}') + printf "% 12s" $acc +done + +echo +echo -n "# final train accuracy: " +for x in $*; do + acc=$(grep acc $x/log/compute_prob_train.final.log | awk '{print $8}') + printf "% 12s" $acc +done + +echo +echo -n "# final test objf: " +for x in $*; do + objf=$(grep log-like $x/log/compute_prob_valid.final.log | awk '{print $8}') + printf "% 12s" $objf +done + +echo +echo -n "# final train objf: " +for x in $*; do + objf=$(grep log-like $x/log/compute_prob_train.final.log | awk '{print $8}') + printf "% 12s" $objf +done + +echo +echo -n "# num-parameters: " +for x in $*; do + params=$(grep num-parameters $x/log/progress.1.log | awk '{print $2}') + printf "% 12s" $params +done + +echo diff --git a/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh b/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh index f31ad7601a9..8e5f83ea2d5 100755 --- a/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh +++ b/egs/cifar/v1/local/nnet3/run_cnn_aug_1b.sh @@ -1,10 +1,14 @@ #!/bin/bash -# aug_1b is the same as 1e but with data augmentation -# accuracy 84.5% (1e has accuracy 83%) +# run_cnn_aug_1b is the same as run_cnn_1e but with data augmentation. + +# accuracy is 0.857, vs. 0.83 for the un-augmented baseline. + +# exp/cnn_aug_1b_cifar10: num-iters=60 nj=1..2 num-params=2.2M dim=96->10 combine=-0.40->-0.38 loglike:train/valid[39,59,final]=(-0.35,-0.26,-0.26/-0.47,-0.42,-0.42) accuracy:train/valid[39,59,final]=(0.88,0.91,0.91/0.84,0.86,0.86) + +# grep Overall exp/cnn_aug_1b_cifar10/log/compute_prob_valid.final.log | grep acc +# LOG (nnet3-compute-prob[5.1]:PrintTotalStats():nnet-diagnostics.cc:165) Overall accuracy for 'output' is 0.8567 per frame, over 10000 frames.# -# steps/info/nnet3_dir_info.pl exp/cnn_aug_1b_cifar10 -# exp/cnn_aug_1b_cifar10/: num-iters=60 nj=1..2 num-params=0.2M dim=96->10 combine=-0.53->-0.50 loglike:train/valid[39,59,final]=(-0.57,-0.45,-0.48/-0.68,-0.62,-0.64) accuracy:train/valid[39,59,final]=(0.80,0.84,0.83/0.76,0.79,0.78) # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -17,7 +21,7 @@ train_stage=-10 dataset=cifar10 srand=0 reporting_email= -affix=_aug_1e +affix=_aug_1b # End configuration section. 
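The hunk just below adds --num-channels=3 to the augmentation options. A rough numpy sketch (not the actual nnet3-egs-augment-image code) of what horizontal flip and vertical shift do to one CIFAR example, assuming each frame is one image column (width = time axis, as these recipes use) and the 96-dim frame interleaves the 3 colour channels fastest along the 32-pixel height — which is why the shift needs to know num-channels. Shift amounts and edge handling are simplified here:

import numpy as np

def augment(eg, num_channels=3, flip=True, vshift=1):
    # eg: (width, height * num_channels) feature matrix for one image; width is the time axis.
    width, dim = eg.shape
    height = dim // num_channels
    out = eg.copy()
    if flip:
        out = out[::-1, :]          # horizontal flip = reverse the time (width) axis
    if vshift:
        # vertical shift moves whole pixels, i.e. steps of num_channels along the frame
        out = np.roll(out.reshape(width, height, num_channels), vshift, axis=1)
        out = out.reshape(width, dim)
    return out

eg = np.random.randn(32, 96).astype(np.float32)   # CIFAR: 32 wide, 32 high, 3 channels
aug = augment(eg)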
@@ -93,7 +97,7 @@ if [ $stage -le 2 ]; then steps/nnet3/train_raw_dnn.py --stage=$train_stage \ --cmd="$train_cmd" \ - --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=30 \ diff --git a/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh b/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh index 23c801290a3..184ea0fa306 100755 --- a/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh +++ b/egs/cifar/v1/local/nnet3/run_cnn_aug_1c.sh @@ -5,7 +5,7 @@ # accuracy improved from 85.8% to 88% # steps/info/nnet3_dir_info.pl exp/cnn_aug_1c_cifar10/ -# exp/cnn_aug_1c_cifar10/: num-iters=200 nj=1..2 num-params=2.2M dim=96->10 combine=-0.24->-0.24 loglike:train/valid[132,199,final]=(-0.18,-0.12,-0.12/-0.39,-0.37,-0.37) accuracy:train/valid[132,199,final]=(0.94,0.96,0.96/0.87,0.88,0.88) +# exp/cnn_aug_1c_cifar10: num-iters=200 nj=1..2 num-params=2.2M dim=96->10 combine=-0.23->-0.24 loglike:train/valid[132,199,final]=(-0.17,-0.12,-0.12/-0.39,-0.36,-0.37) accuracy:train/valid[132,199,final]=(0.94,0.96,0.96/0.87,0.88,0.88) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/cifar/v1/local/nnet3/run_resnet_1a.sh b/egs/cifar/v1/local/nnet3/run_resnet_1a.sh new file mode 100755 index 00000000000..8f41bb96c07 --- /dev/null +++ b/egs/cifar/v1/local/nnet3/run_resnet_1a.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# run_resnet_1a.sh is a quite well-performing resnet. +# It includes a form of shrinkage that approximates l2 regularization. +# (c.f. --proportional-shrink). + +# Definitely better: + +# local/nnet3/compare.sh exp/resnet1a_cifar10 +# System resnet1a_cifar10 +# final test accuracy: 0.9481 +# final train accuracy: 0.9992 +# final test objf: -0.171369 +# final train objf: -0.00980603 +# num-parameters: 1322730 + +# local/nnet3/compare.sh exp/resnet1a_cifar100 +# System resnet1a_cifar100 +# final test accuracy: 0.7478 +# final train accuracy: 0.9446 +# final test objf: -0.899789 +# final train objf: -0.22468 +# num-parameters: 1345860 + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +dataset=cifar10 +srand=0 +reporting_email= +affix=1a + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-layer name=conv1 height-in=32 height-out=32 time-offsets=-1,0,1 required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=$nf1 + res-block name=res2 num-filters=$nf1 height=32 time-period=1 $res_opts + res-block name=res3 num-filters=$nf1 height=32 time-period=1 $res_opts + conv-layer name=conv4 height-in=32 height-out=16 height-subsample-out=2 time-offsets=-1,0,1 $common num-filters-out=$nf2 + res-block name=res5 num-filters=$nf2 height=16 time-period=2 $res_opts + res-block name=res6 num-filters=$nf2 height=16 time-period=2 $res_opts + conv-layer name=conv7 height-in=16 height-out=8 height-subsample-out=2 time-offsets=-2,0,2 $common num-filters-out=$nf3 + res-block name=res8 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res9 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res10 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + channel-average-layer name=channel-average input=Append(2,6,10,14,18,22,24,28) dim=$nf3 + output-layer name=output learning-rate-factor=0.1 dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=60 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.optimization.proportional-shrink=50.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/cifar/v1/local/nnet3/run_resnet_1b.sh b/egs/cifar/v1/local/nnet3/run_resnet_1b.sh new file mode 100755 index 00000000000..f8f3b563e6c --- /dev/null +++ b/egs/cifar/v1/local/nnet3/run_resnet_1b.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# 1b is as 1a but using more epochs: 100 instead of 60. +# This helps a bit. 
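Both resnet scripts pass --trainer.optimization.proportional-shrink=50.0, described above as approximating l2 regularization. A minimal sketch of why, assuming plain SGD (the natural-gradient updates only approximately behave this way): scaling the parameters by shrink-value = (1 - proportional-shrink * lrate) each iteration matches, to first order in the learning rate, the step you would get from an explicit 0.5 * shrink * ||w||^2 penalty.

lrate, shrink = 0.003, 50.0   # initial effective lrate and proportional-shrink from these scripts

def step_with_l2_penalty(w, grad):
    # SGD on objf + 0.5 * shrink * ||w||^2; the penalty's gradient is shrink * w.
    return w - lrate * (grad + shrink * w)

def step_then_scale(w, grad):
    # what proportional-shrink does: an ordinary step, then scale by (1 - shrink * lrate)
    return (w - lrate * grad) * (1.0 - shrink * lrate)

# The two differ only by lrate**2 * shrink * grad, i.e. to first order in the
# learning rate they are the same update.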
+ +#exp/resnet1b_cifar10: num-iters=133 nj=1..2 num-params=1.3M dim=96->10 combine=-0.01->-0.01 loglike:train/valid[87,132,final]=(-0.13,-0.03,-0.01/-0.27,-0.21,-0.16) accuracy:train/valid[87,132,final]=(0.95,0.99,1.00/0.91,0.94,0.95) +#exp/resnet1b_cifar100: num-iters=133 nj=1..2 num-params=1.3M dim=96->100 combine=-0.22->-0.19 loglike:train/valid[87,132,final]=(-0.75,-0.27,-0.16/-1.22,-1.06,-0.89) accuracy:train/valid[87,132,final]=(0.78,0.93,0.96/0.67,0.72,0.76) + + +# local/nnet3/compare.sh exp/resnet1a_cifar10 exp/resnet1b_cifar10 +# System resnet1a_cifar10 resnet1b_cifar10 +# final test accuracy: 0.9481 0.9521 +# final train accuracy: 0.9992 0.9998 +# final test objf: -0.171369 -0.160283 +# final train objf: -0.00980603 -0.00672504 +# num-parameters: 1322730 1322730 + +# local/nnet3/compare.sh exp/resnet1a_cifar100 exp/resnet1b_cifar100 +# System resnet1a_cifar100 resnet1b_cifar100 +# final test accuracy: 0.7478 0.7597 +# final train accuracy: 0.9446 0.9638 +# final test objf: -0.899789 -0.889707 +# final train objf: -0.22468 -0.163996 +# num-parameters: 1345860 1345860 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + + + +# training options +stage=0 +train_stage=-10 +dataset=cifar10 +srand=0 +reporting_email= +affix=1b + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=96 name=input + conv-layer name=conv1 height-in=32 height-out=32 time-offsets=-1,0,1 required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=$nf1 + res-block name=res2 num-filters=$nf1 height=32 time-period=1 $res_opts + res-block name=res3 num-filters=$nf1 height=32 time-period=1 $res_opts + conv-layer name=conv4 height-in=32 height-out=16 height-subsample-out=2 time-offsets=-1,0,1 $common num-filters-out=$nf2 + res-block name=res5 num-filters=$nf2 height=16 time-period=2 $res_opts + res-block name=res6 num-filters=$nf2 height=16 time-period=2 $res_opts + conv-layer name=conv7 height-in=16 height-out=8 height-subsample-out=2 time-offsets=-2,0,2 $common num-filters-out=$nf3 + res-block name=res8 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res9 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + res-block name=res10 num-filters=$nf3 num-bottleneck-filters=$nb3 height=8 time-period=4 $res_opts + channel-average-layer name=channel-average input=Append(2,6,10,14,18,22,24,28) dim=$nf3 + output-layer name=output learning-rate-factor=0.1 dim=$num_targets +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 2 ]; then + + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --image.augmentation-opts="--horizontal-flip-prob=0.5 --horizontal-shift=0.1 --vertical-shift=0.1 --num-channels=3" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=100 \ + --egs.frames-per-eg=1 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.003 \ + --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.minibatch-size=256,128,64 \ + --trainer.optimization.proportional-shrink=50.0 \ + --trainer.shuffle-buffer-size=2000 \ + --egs.dir="$egs" \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + 
--dir=$dir || exit 1; +fi + + +exit 0; diff --git a/egs/cifar/v1/run.sh b/egs/cifar/v1/run.sh index c4760672169..084a8a53041 100755 --- a/egs/cifar/v1/run.sh +++ b/egs/cifar/v1/run.sh @@ -17,6 +17,10 @@ fi # cifar10 egs preparation image/nnet3/get_egs.sh --cmd "$train_cmd" data/cifar10_train data/cifar10_test exp/cifar10_egs - # cifar100 egs preparation image/nnet3/get_egs.sh --cmd "$train_cmd" data/cifar100_train data/cifar100_test exp/cifar100_egs + + +# prepare a different version of the egs with 2 instead of 3 archives. +image/nnet3/get_egs.sh --egs-per-archive 30000 --cmd "$train_cmd" data/cifar10_train data/cifar10_test exp/cifar10_egs2 +image/nnet3/get_egs.sh --egs-per-archive 30000 --cmd "$train_cmd" data/cifar100_train data/cifar100_test exp/cifar100_egs2 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 4a8505d4f3a..fb62d579510 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -260,7 +260,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, do_average = (iter > 0) raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " - "{1}/{2}.mdl - |".format(learning_rate, dir, iter)) + "--scale={1} {2}/{3}.mdl - |".format( + learning_rate, shrinkage_value, dir, iter)) if do_average: cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str @@ -315,16 +316,14 @@ def train_one_iteration(dir, iter, srand, egs_dir, common_train_lib.get_average_nnet_model( dir=dir, iter=iter, nnets_list=" ".join(nnets_list), - run_opts=run_opts, - shrink=shrinkage_value) + run_opts=run_opts) else: # choose the best model from different jobs common_train_lib.get_best_nnet_model( dir=dir, iter=iter, best_model_index=best_model, - run_opts=run_opts, - shrink=shrinkage_value) + run_opts=run_opts) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6c5e0d6d834..e18c43645ae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -78,26 +78,17 @@ def get_successful_models(num_models, log_file_pattern, def get_average_nnet_model(dir, iter, nnets_list, run_opts, - get_raw_nnet_from_am=True, shrink=None): - scale = 1.0 - if shrink is not None: - scale = shrink + get_raw_nnet_from_am=True): next_iter = iter + 1 if get_raw_nnet_from_am: - out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- --scale={scale} \ + out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- \ {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format( dir=dir, iter=iter, - next_iter=next_iter, - scale=scale)) + next_iter=next_iter)) else: - if shrink is not None: - out_model = """- \| nnet3-copy --scale={scale} \ - - {dir}/{next_iter}.raw""".format( - dir=dir, next_iter=next_iter, scale=scale) - else: - out_model = "{dir}/{next_iter}.raw".format(dir=dir, - next_iter=next_iter) + out_model = "{dir}/{next_iter}.raw".format( + dir=dir, next_iter=next_iter) common_lib.execute_command( """{command} {dir}/log/average.{iter}.log \ @@ -110,10 +101,7 @@ def get_average_nnet_model(dir, iter, nnets_list, run_opts, def get_best_nnet_model(dir, iter, best_model_index, run_opts, - get_raw_nnet_from_am=True, shrink=None): - scale = 1.0 - if shrink is not None: - scale = shrink + get_raw_nnet_from_am=True): best_model = "{dir}/{next_iter}.{best_model_index}.raw".format( dir=dir, @@ -130,11 +118,11 @@ def 
get_best_nnet_model(dir, iter, best_model_index, run_opts, common_lib.execute_command( """{command} {dir}/log/select.{iter}.log \ - nnet3-copy --scale={scale} {best_model} \ + nnet3-copy {best_model} \ {out_model}""".format(command=run_opts.command, dir=dir, iter=iter, best_model=best_model, - out_model=out_model, scale=scale)) + out_model=out_model)) def validate_chunk_width(chunk_width): @@ -530,8 +518,8 @@ def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, return num_jobs * effective_learning_rate -def do_shrinkage(iter, model_file, shrink_saturation_threshold, - get_raw_nnet_from_am=True): +def should_do_shrinkage(iter, model_file, shrink_saturation_threshold, + get_raw_nnet_from_am=True): if iter == 0: return True diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 010320d9170..319687aa4c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -77,8 +77,9 @@ def train_new_models(dir, iter, srand, num_jobs, if image_augmentation_opts: image_augmentation_cmd = ( - 'nnet3-egs-augment-image {aug_opts} ark:- ark:- |'.format( - aug_opts=image_augmentation_opts)) + 'nnet3-egs-augment-image --srand={srand} {aug_opts} ark:- ark:- |'.format( + srand=k+srand, + aug_opts=image_augmentation_opts)) else: image_augmentation_cmd = '' @@ -95,8 +96,7 @@ def train_new_models(dir, iter, srand, num_jobs, """nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} """ """--srand={srand} ark:- ark:- | {aug_cmd} """ """nnet3-merge-egs --minibatch-size={minibatch_size_str} """ - """ark:- ark:- |" \ - {dir}/{next_iter}.{job}.raw""".format( + """ ark:- ark:- |" {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, train_queue_opt=run_opts.train_queue_opt, dir=dir, iter=iter, srand=iter + srand, @@ -185,12 +185,14 @@ def train_one_iteration(dir, iter, srand, egs_dir, if get_raw_nnet_from_am: raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " - "{1}/{2}.mdl - |".format(learning_rate, - dir, iter)) + "--scale={1} {2}/{3}.mdl - |".format( + learning_rate, shrinkage_value, + dir, iter)) else: - raw_model_string = ("nnet3-copy --learning-rate={lr} " + raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} " "{dir}/{iter}.raw - |".format( - lr=learning_rate, dir=dir, iter=iter)) + lr=learning_rate, s=shrinkage_value, + dir=dir, iter=iter)) raw_model_string = raw_model_string + dropout_edit_string @@ -240,8 +242,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, nnets_list=" ".join(nnets_list), run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am, - shrink=shrinkage_value) + get_raw_nnet_from_am=get_raw_nnet_from_am) else: # choose the best model from different jobs @@ -249,8 +250,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, dir=dir, iter=iter, best_model_index=best_model, run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am, - shrink=shrinkage_value) + get_raw_nnet_from_am=get_raw_nnet_from_am) try: for i in range(1, num_jobs + 1): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 7c01689e86c..d5c1bc39eaf 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -628,7 +628,8 @@ def __init__(self, first_token, key_to_value, prev_names = None): # Here we just list some likely combinations.. 
you can just add any # combinations you want to use, to this list. assert first_token in [ 'relu-layer', 'relu-renorm-layer', 'sigmoid-layer', - 'tanh-layer', 'relu-batchnorm-layer', 'relu-dropout-layer' ] + 'tanh-layer', 'relu-batchnorm-layer', 'relu-dropout-layer', + 'relu-batchnorm-dropout-layer' ] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py index e14aca92b3b..12f4979c39a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/convolution.py @@ -209,7 +209,7 @@ def output_name(self, auxiliary_output = None): assert auxiliary_output is None # note: the [:-1] is to remove the '-layer'. operations = self.layer_type.split('-')[:-1] - assert len(operations) > 1 + assert len(operations) >= 1 last_operation = operations[-1] assert last_operation in ['relu', 'conv', 'renorm', 'batchnorm', 'dropout'] @@ -264,7 +264,6 @@ def generate_cnn_config(self): a.append('{0}={1}'.format(opt_name, value)) conv_opts = ' '.join(a) - configs.append("### Begin convolutional layer '{0}'".format(name)) configs.append('component name={0}.conv type=TimeHeightConvolutionComponent ' '{1}'.format(name, conv_opts)) configs.append('component-node name={0}.conv component={0}.conv ' @@ -305,3 +304,434 @@ def generate_cnn_config(self): cur_descriptor = '{0}.{1}'.format(name, operation) return configs + + +# This class is for lines like the following: +# +# res-block name=res1 num-filters=64 height=32 time-period=1 +# +# It implements a residual block as in ResNets, but with some small differences +# that make it a little more general-- basically, instead of adding the input to +# the output, we put a convolutional layer in there but initialize it to the +# unit matrix and if you want you can give it a relatively small (or even zero) +# learning rate and max-change. And there is batch-norm in that path also. +# +# The number of filters is the same on the input and output; it is actually +# redundant to write it in the config file, because given that we know the +# height, we can work it out from the dimension of the input (as dimension = +# height * num-filters). But we allow it to be specified anyway, for clarity. +# +# Note: the res-block does not support subsampling or changing the number of +# filters. If you want to do that, we recommend that you should do it with a +# single relu-batchnorm-conv-layer. +# +# Here are the most important configuration values, with defaults shown if +# defaults exist: +# +# input='[-1]' Descriptor giving the input of the layer. +# height The input and output height of the image, e.g. 40. Note: the width +# is associated with the time dimension and is dealt with +# implicitly, so it's not specified here. +# num-filters The number of filters on the input and output, e.g. 64. +# It does not have to be specified; if it is not specified, +# we work it out from the input dimension. +# num-bottleneck-filters If specified then this will be a 'bottleneck' +# ResBlock, in which there is a 1x1 convolution from +# num-filters->num-bottleneck-filters, a 3x3 convolution +# from num-bottleneck-filters->num-bottleneck-filters, and +# a 1x1 convolution from num-bottleneck-filters->num-filters. +# +# time-period=1 Think of this as the stride in the time dimension. 
At the +# input of the network will always have time-period=1; then +# after subsampling once in time we'd have time-period=2; then +# after subsampling again we'd have time-period=4. Because of +# the way nnet3 works, subsampling on the time axis is an +# implicit, not explicit, operation. +# bypass-source=noop +# The output of this component is Sum(convolution, x), and +# this option controls what 'x' is. There are 3 options +# here: 'noop', 'input', 'relu' or 'batchnorm'. 'noop' is +# equivalent to 'input' in what it computes; it just +# inserts a 'noop' component in order to make the +# computation more efficient. For both 'noop' and +# 'input', x is the input to this component. If +# bypass-source=relu then we use the relu of the +# input; if 'batchnorm', then we use the relu+batchnorm of +# the input. +# allow-zero-padding=true By default this will allow zero-padding in the time +# dimension, meaning that you don't need extra frames at +# the input to compute the output. There may be ASR +# applications where you want to pad in the time dimension +# with repeats of the first or last frame (as we do for +# TDNNs), where it would be appropriate to write +# allow-zero-padding=false. Note: the way we have +# set it up, it does zero-padding on the height axis +# regardless +# +# Less important config variables: +# self-repair-scale=2.0e-05 This affects the ReLu's. It is a scale on the +# 'self-repair' mechanism that nudges the inputs to the +# ReLUs into the appropriate range in cases where +# the unit is active either too little of the time +# (<10%) or too much of the time (>90%). +# max-change=0.75 Max-parameter-change constant (per minibatch) +# used for convolutional components. +# +# +# The following natural-gradient-related configuration variables are passed in +# to the convolution components, if specified: +# use-natural-gradient (bool) +# rank-in, rank-out (int) +# num-minibatches-history (float) +# alpha-in, alpha-out (float) + +class XconfigResBlock(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == 'res-block' + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'height':-1, + 'num-filters':-1, + 'num-bottleneck-filters':-1, + 'time-period':1, + 'self-repair-scale': 2.0e-05, + 'max-change': 0.75, + 'allow-zero-padding': True, + 'bypass-source' : 'noop', + # the following are not really inspected by this level of + # code, just passed through (but not if left at ''). + 'param-stddev':'', 'bias-stddev':'', + 'use-natural-gradient':'', + 'rank-in':'', 'rank-out':'', + 'num-minibatches-history':'', + 'alpha-in':'', 'alpha-out':''} + + def set_derived_configs(self): + # set 'num-filters' or check it.. + input_dim = self.descriptors['input']['dim'] + height = self.config['height'] + + cur_num_filters = self.config['num-filters'] + if cur_num_filters == -1: + if input_dim % height != 0: + raise RuntimeError("Specified image height {0} does not " + "divide the input dim {1}".format( + height, input_dim)) + self.config['num-filters'] = input_dim / height + elif input_dim != cur_num_filters * height: + raise RuntimeError("Expected the input-dim to equal " + "height={0} * num-filters={1} = {2}, but " + "it is {3}".format( + height, cur_num_filters, + height * cur_num_filters, + input_dim)); + + def check_configs(self): + # we checked the dimensions in set_derived_configs. 
+ if not self.config['bypass-source'] in [ + 'input', 'noop', 'relu', 'batchnorm' ]: + raise RuntimeError("Expected direct-convolution-source to " + "be input, relu or batchnorm, got: {1}".format( + self.config['direct-convolution-source'])) + + def auxiliary_outputs(self): + return [] + + def output_name(self, auxiliary_output = None): + bypass_source = self.config['bypass-source'] + b = self.config['num-bottleneck-filters'] + conv = ('{0}.conv2' if b <= 0 else '{0}.conv3').format(self.name) + if bypass_source == 'input': + residual = self.descriptors['input']['final-string'] + elif bypass_source == 'noop': + # we let the noop be the sum of the convolutional part and the + # input, so just return the output of the no-op component. + return '{0}.noop'.format(self.name) + elif bypass_source == 'relu': + residual = '{0}.relu1'.format(self.name) + else: + assert bypass_source == 'batchnorm' + residual = '{0}.batchnorm1'.format(self.name) + + return 'Sum({0}, {1})'.format(conv, residual) + + def output_dim(self, auxiliary_output = None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + b = self.config['num-bottleneck-filters'] + if b <= 0: + config_lines = self.generate_normal_resblock_config() + else: + config_lines = self.generate_bottleneck_resblock_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in CNN initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + # generate_normal_resblock_config is a convenience function to generate the + # res-block config (the non-bottleck version). + # + # The main path inside the res-block in the non-bottleneck case is as + # follows: + # + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 + # + # We put the relu before the batchnorm because we think it makes more sense; + # because the Torch people seemed to find that this works better + # (https://github.com/gcr/torch-residual-networks/issues/5); + # and because in our batchnorm component we haven't implemented the beta and + # gamma; these would be essential to having it work before relu, but + # when before a convolution or linear component, they add no extra modeling + # power. + # + # The output of the res-block can be the sum of the last convolutional + # component (conv2), with the input. However, the option ('bypass-source') + # controls whether we sum with the raw input, or its relu or relu+batchnorm. + # If the term is going to be the raw input, we give the option ('noop') and + # to cache the output sum via a NoOpComponent)-- because due to how nnet3 + # works, if we didn't do this, redundant summing operations would take + # place. 
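A hedged functional picture of the block that generate_normal_resblock_config() below emits; the names are illustrative stand-ins for the generated components, not real APIs:

def res_block_forward(x, relu, bn1, bn2, conv1, conv2, bypass_source='noop'):
    # main path: input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2
    y = conv1(bn1(relu(x)))
    y = conv2(bn2(relu(y)))
    if bypass_source in ('noop', 'input'):
        return x + y                 # the NoOpComponent just caches this sum
    elif bypass_source == 'relu':
        return relu(x) + y           # residual taken after relu1
    else:                            # 'batchnorm'
        return bn1(relu(x)) + y      # residual taken after relu1 + batchnorm1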
+ def generate_normal_resblock_config(self): + configs = [] + + name = self.name + num_filters = self.config['num-filters'] + assert self.config['num-bottleneck-filters'] == -1 + height = self.config['height'] + input_descriptor = self.descriptors['input']['final-string'] + allow_zero_padding = self.config['allow-zero-padding'] + time_period = self.config['time-period'] + + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 + cur_descriptor = input_descriptor + for n in [1, 2]: + # the ReLU + configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' + 'dim={2} self-repair-scale={3}'.format( + name, n, num_filters * height, + self.config['self-repair-scale'])) + configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + + cur_descriptor = '{0}.relu{1}'.format(name, n) + + # the batch-norm + configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' + 'block-dim={3}'.format( + name, n, num_filters * height, + num_filters)) + configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.batchnorm{1}'.format(name, n) + + + # the convolution. + a = [] + for opt_name in [ + 'param-stddev', 'bias-stddev', 'use-natural-gradient', + 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', + 'alpha-in', 'alpha-out' ]: + value = self.config[opt_name] + if value != '': + a.append('{0}={1}'.format(opt_name, value)) + conv_opts = ('height-in={h} height-out={h} height-offsets=-1,0,1 time-offsets=-{p},0,{p} ' + 'num-filters-in={f} num-filters-out={f} {r} {o}'.format( + h=height, p=time_period, f=num_filters, + r=('required-time-offsets=0' if allow_zero_padding else ''), + o=' '.join(a))) + + configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' + '{2}'.format(name, n, conv_opts)) + configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.conv{1}'.format(name, n) + + + + if self.config['bypass-source'] == 'noop': + dim = self.descriptors['input']['dim'] + configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format( + name, dim)) + configs.append('component-node name={0}.noop component={0}.noop ' + 'input=Sum({1}, {0}.conv2)'.format(name, + input_descriptor)) + + # Note: the function 'output_name' is responsible for returning the + # descriptor corresponding to the output of the network. + return configs + + + + # generate_bottleneck_resblock_config is a convenience function to generate the + # res-block config (this is the bottleneck version, where there is + # a 3x3 kernel with a smaller number of filters than at the input and output, + # sandwiched between two 1x1 kernels. + # + # The main path inside the res-block in the bottleneck case is as follows: + # + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 -> + # relu3 -> batchnorm3 -> conv3 + # + # power. + # + # The output of the res-block can be the sum of the last convolutional + # component (conv3), with the input. However we give the option + # ('bypass-source') to sum with the raw input, or its relu or + # relu+batchnorm. If the term is going to be the raw input, we give the + # option ('noop') and to cache the output sum via a NoOpComponent)-- because + # due to how nnet3 works, if we didn't do this, redundant summing operations + # would take place. 
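For the bottleneck variant generated by the method below, the point of the 1x1 -> 3x3 -> 1x1 sandwich is the parameter count; a back-of-the-envelope comparison with purely hypothetical filter counts (biases ignored):

num_filters, num_bottleneck = 256, 64      # illustrative values only

# plain res-block: two 3x3 (3 height-offsets x 3 time-offsets) convolutions
plain = 2 * (3 * 3 * num_filters * num_filters)

# bottleneck res-block: 1x1 down, 3x3 in the middle, 1x1 back up
bottleneck = (num_filters * num_bottleneck
              + 3 * 3 * num_bottleneck * num_bottleneck
              + num_bottleneck * num_filters)

print(plain, bottleneck)     # 1179648 vs 69632 parameters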
+ def generate_bottleneck_resblock_config(self): + configs = [] + + name = self.name + num_filters = self.config['num-filters'] + num_bottleneck_filters = self.config['num-bottleneck-filters'] + assert num_bottleneck_filters > 0 + height = self.config['height'] + input_descriptor = self.descriptors['input']['final-string'] + allow_zero_padding = self.config['allow-zero-padding'] + time_period = self.config['time-period'] + + # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 + cur_descriptor = input_descriptor + cur_num_filters = num_filters + + for n in [1, 2, 3]: + # the ReLU + configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' + 'dim={2} self-repair-scale={3}'.format( + name, n, cur_num_filters * height, + self.config['self-repair-scale'])) + configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + + cur_descriptor = '{0}.relu{1}'.format(name, n) + + # the batch-norm + configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' + 'block-dim={3}'.format( + name, n, cur_num_filters * height, + cur_num_filters)) + configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.batchnorm{1}'.format(name, n) + + + # the convolution. + a = [] + for opt_name in [ + 'param-stddev', 'bias-stddev', 'use-natural-gradient', + 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', + 'alpha-in', 'alpha-out' ]: + value = self.config[opt_name] + if value != '': + a.append('{0}={1}'.format(opt_name, value)) + + height_offsets = ('-1,0,1' if n == 2 else '0') + time_offsets = ('-{t},0,{t}'.format(t=time_period) if n == 2 else '0') + num_filters_in = cur_num_filters + num_filters_out = (num_filters if n == 3 else num_bottleneck_filters) + cur_num_filters = num_filters_out + + conv_opts = ('height-in={h} height-out={h} height-offsets={ho} time-offsets={to} ' + 'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format( + h=height, ho=height_offsets, to=time_offsets, + fi=num_filters_in, fo=num_filters_out, + r=('required-time-offsets=0' if allow_zero_padding else ''), + o=' '.join(a))) + + configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' + '{2}'.format(name, n, conv_opts)) + configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' + 'input={2}'.format(name, n, cur_descriptor)) + cur_descriptor = '{0}.conv{1}'.format(name, n) + + + + if self.config['bypass-source'] == 'noop': + dim = self.descriptors['input']['dim'] + configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format( + name, dim)) + configs.append('component-node name={0}.noop component={0}.noop ' + 'input=Sum({1}, {0}.conv3)'.format(name, + input_descriptor)) + + # Note: the function 'output_name' is responsible for returning the + # descriptor corresponding to the output of the network. + return configs + + +# This layer just maps to a single component, a SumBlockComponent. It's for +# doing channel averaging at the end of neural networks. See scripts for +# examples of how to use it. +# An example line using this layer is: +# channel-average-layer name=channel-average input=Append(2, 4, 6, 8) dim=64 + +# the configuration value 'dim' is the output dimension of this layer. +# The input dimension is expected to be a multiple of 'dim'. The output +# will be the average of 'dim'-sized blocks of the input. 
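A rough numpy equivalent of the single SumBlockComponent that the class below generates:

import numpy as np

def channel_average(x, dim):
    # x: (num_frames, input_dim) with input_dim = n * dim; scale = dim / input_dim,
    # so the output is the mean of the n consecutive dim-sized blocks.
    n = x.shape[1] // dim
    return x.reshape(x.shape[0], n, dim).mean(axis=1)

In the resnet scripts the input is an Append(...) over several time offsets, so this one component effectively does global average pooling over time and height, leaving one output per filter.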
+class ChannelAverageLayer(XconfigLayerBase): + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "channel-average-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'dim': -1 } + + def set_derived_configs(self): + pass + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + dim = self.config['dim'] + if dim <= 0: + raise RuntimeError("dim must be specified and > 0.") + if input_dim % dim != 0: + raise RuntimeError("input-dim={0} is not a multiple of dim={1}".format( + input_dim, dim)) + + def auxiliary_outputs(self): + return [] + + def output_name(self, auxiliary_output = None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output = None): + assert auxiliary_output is None + return self.config['dim'] + + + def get_full_config(self): + ans = [] + config_lines = self.generate_channel_average_config() + for line in config_lines: + for config_name in ['ref', 'final']: + ans.append((config_name, line)) + return ans + + def generate_channel_average_config(self): + configs = [] + name = self.name + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + dim = self.config['dim'] + # choose the scale that makes it an average rather than a sum. + scale = dim * 1.0 / input_dim + configs.append('component name={0} type=SumBlockComponent input-dim={1} ' + 'output-dim={2} scale={3}'.format(name, input_dim, + dim, scale)) + configs.append('component-node name={0} component={0} input={1}'.format( + name, input_descriptor)) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 0ab4a5e5f63..a7d5ece6ce9 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -21,6 +21,8 @@ 'output-layer' : xlayers.XconfigOutputLayer, 'relu-layer' : xlayers.XconfigBasicLayer, 'relu-renorm-layer' : xlayers.XconfigBasicLayer, + 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, + 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, @@ -32,15 +34,17 @@ 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, + 'conv-layer': xlayers.XconfigConvLayer, 'conv-relu-layer': xlayers.XconfigConvLayer, 'relu-conv-renorm-layer': xlayers.XconfigConvLayer, 'conv-relu-renorm-layer': xlayers.XconfigConvLayer, - 'relu-conv-batchnorm-layer': xlayers.XconfigConvLayer, + 'batchnorm-conv-relu-layer': xlayers.XconfigConvLayer, + 'relu-batchnorm-conv-layer': xlayers.XconfigConvLayer, 'conv-relu-batchnorm-layer': xlayers.XconfigConvLayer, 'conv-relu-batchnorm-dropout-layer': xlayers.XconfigConvLayer, 'conv-relu-dropout-layer': xlayers.XconfigConvLayer, - 'relu-dropout-layer': xlayers.XconfigBasicLayer - + 'res-block': xlayers.XconfigResBlock, + 'channel-average-layer': xlayers.ChannelAverageLayer } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6bc51dcbd3f..59185235ba1 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -435,7 +435,7 @@ def learning_rate(iter, current_num_jobs, 
num_archives_processed): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.do_shrinkage( + if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold) else 1 diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 6c7123f7fa6..0fd0cc04d48 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -80,6 +80,18 @@ def get_args(): rule as accepted by the --minibatch-size option of nnet3-merge-egs; run that program without args to see the format.""") + parser.add_argument("--trainer.optimization.proportional-shrink", type=float, + dest='proportional_shrink', default=0.0, + help="""If nonzero, this will set a shrinkage (scaling) + factor for the parameters, whose value is set as: + shrink-value=(1.0 - proportional-shrink * learning-rate), where + 'learning-rate' is the learning rate being applied + on the current iteration, which will vary from + initial-effective-lrate*num-jobs-initial to + final-effective-lrate*num-jobs-final. + Unlike for train_rnn.py, this is applied unconditionally, + it does not depend on saturation of nonlinearities. + Can be used to roughly approximate l2 regularization.""") # General options parser.add_argument("--nj", type=int, default=4, @@ -320,6 +332,17 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + lrate = learning_rate(iter, current_num_jobs, + num_archives_processed) + shrink_value = 1.0 + if args.proportional_shrink != 0.0: + shrink_value = 1.0 - (args.proportional_shrink * lrate) + if shrink_value <= 0.5: + raise Exception("proportional-shrink={0} is too large, it gives " + "shrink-value={1}".format(args.proportional_shrink, + shrink_value)) + + if args.stage <= iter: train_lib.common.train_one_iteration( dir=args.dir, @@ -329,8 +352,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): num_jobs=current_num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, - learning_rate=learning_rate(iter, current_num_jobs, - num_archives_processed), + learning_rate=lrate, dropout_edit_string=common_train_lib.get_dropout_edit_string( args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, @@ -339,6 +361,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): frames_per_eg=args.frames_per_eg, momentum=args.momentum, max_param_change=args.max_param_change, + shrinkage_value=shrink_value, shuffle_buffer_size=args.shuffle_buffer_size, run_opts=run_opts, get_raw_nnet_from_am=False, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index 60d1c7fd5fe..812be8b95f3 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -386,7 +386,7 @@ def learning_rate(iter, current_num_jobs, num_archives_processed): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.do_shrinkage( + if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold, get_raw_nnet_from_am=False) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index e8c044d679a..8405244a7ae 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -382,7 +382,7 @@ def learning_rate(iter, current_num_jobs, 
num_archives_processed): shrinkage_value = 1.0 if args.shrink_value != 1.0: shrinkage_value = (args.shrink_value - if common_train_lib.do_shrinkage( + if common_train_lib.should_do_shrinkage( iter, model_file, args.shrink_saturation_threshold) else 1 diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 5b72a62e716..3b02b266a01 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -97,6 +97,10 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans); +void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double *src, + MatrixDim src_dim, double *dst, MatrixDim dst_dim); +void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float *src, + MatrixDim src_dim, float *dst, MatrixDim dst_dim); void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 6df0e5af9db..b1a9bb1819a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -578,6 +578,22 @@ static void _add_mat_blocks(Real alpha, const Real* src, } } +template +__global__ +static void _add_mat_repeated(Real alpha, const Real* src, + MatrixDim src_dim, Real* dst, + MatrixDim dst_dim) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda src_i = i % src_dim.cols, + src_j = j % src_dim.rows, + dst_index = i + j * dst_dim.stride, + src_index = src_i + src_j * src_dim.stride; + if (i < dst_dim.cols && j < dst_dim.rows) + dst[dst_index] += alpha * src[src_index]; +} + + template __global__ static void _add_mat_blocks_trans(Real alpha, const Real* src, @@ -3558,6 +3574,12 @@ void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, } } +void cudaF_add_mat_repeated(dim3 Gr, dim3 Bl, float alpha, const float* src, + MatrixDim src_dim, float *dst, MatrixDim dst_dim) { + _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); +} + + void cudaF_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { @@ -4217,6 +4239,11 @@ void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, } } +void cudaD_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, const double* src, + MatrixDim src_dim, double *dst, MatrixDim dst_dim) { + _add_mat_repeated<<>>(alpha, src, src_dim, dst, dst_dim); +} + void cudaD_set_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index d2a79f471c8..a2c4aaceb3d 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -158,6 +158,16 @@ inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } +inline void cuda_add_mat_repeated(dim3 Gr, dim3 Bl, double alpha, + const double *src, MatrixDim src_dim, + double *dst, MatrixDim dst_dim) { + cudaD_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); +} +inline void cuda_add_mat_repeated(dim3 
Gr, dim3 Bl, float alpha, + const float *src, MatrixDim src_dim, + float *dst, MatrixDim dst_dim) { + cudaF_add_mat_repeated(Gr, Bl, alpha, src, src_dim, dst, dst_dim); +} inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 2157c97156f..5e49b483c61 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -1223,39 +1223,95 @@ static void UnitTestCuMatrixAddMat() { } } -template -static void UnitTestCuMatrixAddMatBlocks() { - int32 num_row_blocks = 10, num_col_blocks = 20; - Matrix Ha1(100, 100), Ha2(100, 100); - Matrix Hb(100 * num_row_blocks, 100 * num_col_blocks); - Ha1.SetRandn(); - Ha2.SetRandn(); - Hb.SetRandn(); - CuMatrix Da1(100, 100), Da2(100, 100); - CuMatrix Db(100 * num_row_blocks, 100 * num_col_blocks); - Da1.CopyFromMat(Ha1); - Da2.CopyFromMat(Ha2); - Db.CopyFromMat(Hb); +// this tests the branch of AddMatBlocks() that is taken when +// 'this' has a smaller dimension than 'src' (it sums). +template +static void UnitTestCuMatrixAddMatBlocks1() { + for (int32 l = 0; l < 5; l++) { + int32 num_row_blocks = RandInt(1, 10), num_col_blocks = RandInt(1, 20); + int32 block_rows = RandInt(1, 100), block_cols = RandInt(1, 100); + BaseFloat alpha = RandInt(3, 10); + CuMatrix dst(block_rows, block_cols); + dst.SetRandn(); + CuMatrix src(num_row_blocks * block_rows, + num_col_blocks * block_cols); + src.SetRandn(); - for (int32 i = 0; i < num_row_blocks; i++) { - for (int32 j = 0; j < num_col_blocks; j++) { - SubMatrix Hs(Hb.Range(i * 100, 100, j * 100, 100)); - Ha1.AddMat(0.5, Hs, kNoTrans); - Ha2.AddMat(0.5, Hs, kTrans); + CuMatrix dst_copy(dst); + for (int32 rb = 0; rb < num_row_blocks; rb++) { + for (int32 cb = 0; cb < num_col_blocks; cb++) { + CuSubMatrix src_part(src, + rb * block_rows, block_rows, + cb * block_cols, block_cols); + dst_copy.AddMat(alpha, src_part); + } } + dst.AddMatBlocks(alpha, src); + AssertEqual(dst, dst_copy); } +} - Da1.AddMatBlocks(0.5, Db, kNoTrans); - Da2.AddMatBlocks(0.5, Db, kTrans); - Matrix Ha11(100, 100); - Da1.CopyToMat(&Ha11); - AssertEqual(Ha1,Ha11); - Matrix Ha22(100, 100); - Da2.CopyToMat(&Ha22); - AssertEqual(Ha2,Ha22); +// this is as UnitTestCuMatrixAddMatBlocks1, but tests with transpose. +template +static void UnitTestCuMatrixAddMatBlocks1Trans() { + for (int32 l = 0; l < 5; l++) { + int32 num_row_blocks = RandInt(1, 10), num_col_blocks = RandInt(1, 20); + int32 block_rows = RandInt(1, 100), block_cols = RandInt(1, 100); + BaseFloat alpha = RandInt(3, 10); + CuMatrix dst(block_cols, block_rows); + dst.SetRandn(); + CuMatrix src(num_row_blocks * block_rows, + num_col_blocks * block_cols); + src.SetRandn(); + + CuMatrix dst_copy(dst); + for (int32 rb = 0; rb < num_row_blocks; rb++) { + for (int32 cb = 0; cb < num_col_blocks; cb++) { + CuSubMatrix src_part(src, + rb * block_rows, block_rows, + cb * block_cols, block_cols); + dst_copy.AddMat(alpha, src_part, kTrans); + } + } + dst.AddMatBlocks(alpha, src, kTrans); + AssertEqual(dst, dst_copy); + } } + +// this tests the branch of AddMatBlocks() that is taken when +// 'this' has a larger dimension than 'src'. In this case, it does +// a broadcasting rather than a summing operation. 
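A hedged numpy picture of the two AddMatBlocks() branches these tests exercise (dst += alpha * src with mismatched sizes; the kTrans case and the exact divisibility checks are omitted):

import numpy as np

def add_mat_blocks(dst, alpha, src):
    R, C = dst.shape
    r, c = src.shape
    if r >= R and c >= C:
        # summing branch: src is an (r/R) x (c/C) grid of dst-sized blocks
        dst += alpha * src.reshape(r // R, R, c // C, C).sum(axis=(0, 2))
    else:
        # broadcasting branch: dst is a grid of src-sized blocks, add src to each
        dst += alpha * np.tile(src, (R // r, C // c))
    return dst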
+template +static void UnitTestCuMatrixAddMatBlocks2() { + for (int32 l = 0; l < 5; l++) { + int32 num_row_blocks = RandInt(1, 10), num_col_blocks = RandInt(1, 20); + int32 block_rows = RandInt(1, 100), block_cols = RandInt(1, 100); + BaseFloat alpha = RandInt(3, 10); + CuMatrix src(block_rows, block_cols); + src.SetRandn(); + CuMatrix dst(num_row_blocks * block_rows, + num_col_blocks * block_cols); + src.SetRandn(); + + CuMatrix dst_copy(dst); + for (int32 rb = 0; rb < num_row_blocks; rb++) { + for (int32 cb = 0; cb < num_col_blocks; cb++) { + CuSubMatrix dst_copy_part(dst_copy, + rb * block_rows, block_rows, + cb * block_cols, block_cols); + dst_copy_part.AddMat(alpha, src); + } + } + dst.AddMatBlocks(alpha, src); + AssertEqual(dst, dst_copy); + } +} + + + + template static void UnitTestCuMatrixReduceSum() { int32 M = 100 + Rand() % 300, N = 100 + Rand() % 300; @@ -2646,7 +2702,9 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixDivRowsVec(); UnitTestCuMatrixAddMat(); - UnitTestCuMatrixAddMatBlocks(); + UnitTestCuMatrixAddMatBlocks1(); + UnitTestCuMatrixAddMatBlocks1Trans(); + UnitTestCuMatrixAddMatBlocks2(); UnitTestCuMatrixReduceSum(); UnitTestCuMatrixReduceMax(); UnitTestCuMatrixReduceMin(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index cfa570233c3..91e140e6bcd 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -912,6 +912,7 @@ void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { } } + template void CuMatrixBase::InvertElements() { #if HAVE_CUDA == 1 @@ -969,43 +970,81 @@ template void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType transA) { if (num_rows_ == 0 || num_cols_ == 0) return; - int32 num_row_blocks, num_col_blocks; - if (transA == kNoTrans) { - KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); - num_row_blocks = A.Mat().NumRows() / num_rows_; - num_col_blocks = A.Mat().NumCols() / num_cols_; - } else { - KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); - num_row_blocks = A.Mat().NumRows() / num_cols_; - num_col_blocks = A.Mat().NumCols() / num_rows_; - } -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, - num_col_blocks, data_, Dim(), A.Stride(), - (transA == kTrans ? 1 : 0)); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - int32 nr, nc; + if (A.NumRows() >= num_rows_ && A.NumCols() >= num_cols_) { + // This is the "summing", not broadcasting, version of AddMatBlocks. + // It supports both regular and transposed operation. 
+ int32 num_row_blocks, num_col_blocks; if (transA == kNoTrans) { - nr = num_rows_; - nc = num_cols_; + KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); + num_row_blocks = A.Mat().NumRows() / num_rows_; + num_col_blocks = A.Mat().NumCols() / num_cols_; } else { - nr = num_cols_; - nc = num_rows_; + KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); + num_row_blocks = A.Mat().NumRows() / num_cols_; + num_col_blocks = A.Mat().NumCols() / num_rows_; + } +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, + num_col_blocks, data_, Dim(), A.Stride(), + (transA == kTrans ? 1 : 0)); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 nr, nc; + if (transA == kNoTrans) { + nr = num_rows_; + nc = num_cols_; + } else { + nr = num_cols_; + nc = num_rows_; + } + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), + transA); + } + } } - for (int32 i = 0; i < num_row_blocks; i++) { - for (int32 j = 0; j < num_col_blocks; j++) { - Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), - transA); + } else { + // This is the "broadcasting" version of AddMatBlocks, where + // *this is larger than src. + if (!(num_rows_ % A.NumRows() == 0 && num_cols_ % A.NumCols() == 0)) + KALDI_ERR << "Invalid sizes of arguments"; + if (transA != kNoTrans) + KALDI_ERR << "Transposed operation not supported currently."; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_repeated(dimGrid, dimBlock, alpha, + A.data_, A.Dim(), data_, Dim()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + const MatrixBase &src_mat = A.Mat(), + &this_mat = this->Mat(); + for (int32 row_offset = 0; row_offset < NumRows(); + row_offset += src_mat.NumRows()) { + for (int32 col_offset = 0; col_offset < NumCols(); + col_offset += src_mat.NumCols()) { + SubMatrix this_part(this_mat, + row_offset, src_mat.NumRows(), + col_offset, src_mat.NumCols()); + this_part.AddMat(alpha, src_mat); + } } } } diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 0a4c4b0669e..e8823793cc3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -421,9 +421,25 @@ class CuMatrixBase { void AddMat(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); - /// if A.NumRows() is multiple of (*this)->NumRows and A.NumCols() is multiple of (*this)->NumCols - /// divide A into blocks of the same size as (*this) and add them to *this (times alpha) - void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); + + /// This function is like AddMat (it does *this += alpha * src), + /// except that it supports cases where *this and src have + /// different dimension. There are two allowed cases: + /// + /// (1) *this is larger than src; we do a broadcasting operation. *this must + /// have NumRows() == a * src.NumRows() and NumCols() == b * + /// src.NumCols() for integer a >= 1, b >= 1. 
*this will be treated as + /// a being made up of of blocks with the same size as src, and to each + /// block we'll add alpha * src. This case does not support trans == + /// kTrans. + /// + /// (2) *this is smaller than src; we sum. src.NumRows() must == a * + /// this->NumRows(), and src.NumCols() must == b * this->NumCols(), for a + /// >= 1, b >= 1. In this case, src will be treated as being made up of + /// blocks with the same size as *this, and to *this we will add the + /// summation of all of those blocks. + void AddMatBlocks(Real alpha, const CuMatrixBase &A, + MatrixTransposeType trans = kNoTrans); /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); diff --git a/src/featbin/copy-feats.cc b/src/featbin/copy-feats.cc index 8f94f27d4dd..29fbf2c3be0 100644 --- a/src/featbin/copy-feats.cc +++ b/src/featbin/copy-feats.cc @@ -52,7 +52,7 @@ int main(int argc, char *argv[]) { "(only currently supported for wxfilename, i.e. archive/script," "output)"); po.Register("compression-method", &compression_method_in, - "Only relevant if --compress=true; the method (1 through 6) to " + "Only relevant if --compress=true; the method (1 through 7) to " "compress the matrix. Search for CompressionMethod in " "src/matrix/compressed-matrix.h."); po.Register("write-num-frames", &num_frames_wspecifier, diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc index 89869c39dbd..5c0187a267a 100644 --- a/src/nnet3/nnet-combine.cc +++ b/src/nnet3/nnet-combine.cc @@ -222,7 +222,7 @@ void NnetCombiner::PrintParams(const VectorBase ¶ms) const { int32 num_effective_nnets = nnet_params_.NumRows(); if (num_effective_nnets != num_real_input_nnets_) KALDI_LOG << "Above, only " << num_effective_nnets << " weights were " - "printed due to the the --num-effective-nnets option; " + "printed due to the the --max-effective-inputs option; " "there were " << num_real_input_nnets_ << " actual input nnets. 
" "Each weight corresponds to a weighted average over a range of " "nnets in the sequence (with triangular bins)"; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 8a8a10d9475..19b86bbd482 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -99,8 +99,6 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new NormalizeComponent(); } else if (component_type == "PnormComponent") { ans = new PnormComponent(); - } else if (component_type == "SumReduceComponent") { - ans = new SumReduceComponent(); } else if (component_type == "AffineComponent") { ans = new AffineComponent(); } else if (component_type == "NaturalGradientAffineComponent") { @@ -161,6 +159,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new BatchNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); + } else if (component_type == "SumBlockComponent") { + ans = new SumBlockComponent(); } if (ans != NULL) { KALDI_ASSERT(component_type == ans->Type()); diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 1b07befdf95..d8ed2380143 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -87,6 +87,32 @@ std::string TimeHeightConvolutionComponent::Info() const { } +void TimeHeightConvolutionComponent::InitUnit() { + if (model_.num_filters_in != model_.num_filters_out) { + KALDI_ERR << "You cannot specify init-unit if the num-filters-in " + << "and num-filters-out differ."; + } + size_t i; + int32 zero_offset = 0; + for (i = 0; i < model_.offsets.size(); i++) { + if (model_.offsets[i].time_offset == 0 && + model_.offsets[i].height_offset == 0) { + zero_offset = i; + break; + } + } + if (i == model_.offsets.size()) // did not break. + KALDI_ERR << "You cannot specify init-unit if the model does " + << "not have the offset (0, 0)."; + + CuSubMatrix zero_offset_block( + linear_params_, 0, linear_params_.NumRows(), + zero_offset * model_.num_filters_in, model_.num_filters_in); + + KALDI_ASSERT(zero_offset_block.NumRows() == zero_offset_block.NumCols()); + zero_offset_block.AddToDiag(1.0); // set this block to the unit matrix. +} + void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { // 1. Config values inherited from UpdatableComponent. InitLearningRatesFromConfig(cfl); @@ -169,16 +195,22 @@ void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { // 3. Parameter-initialization configs. BaseFloat param_stddev = -1, bias_stddev = 0.0; + bool init_unit = false; cfl->GetValue("param-stddev", ¶m_stddev); cfl->GetValue("bias-stddev", &bias_stddev); + cfl->GetValue("init-unit", &init_unit); if (param_stddev < 0.0) { param_stddev = 1.0 / sqrt(model_.num_filters_in * model_.offsets.size()); } // initialize the parameters. 
linear_params_.Resize(model_.ParamRows(), model_.ParamCols()); - linear_params_.SetRandn(); - linear_params_.Scale(param_stddev); + if (!init_unit) { + linear_params_.SetRandn(); + linear_params_.Scale(param_stddev); + } else { + InitUnit(); + } bias_params_.Resize(model_.num_filters_out); bias_params_.SetRandn(); bias_params_.Scale(bias_stddev); diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index a48987213af..59442504444 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -120,6 +120,12 @@ namespace nnet3 { filters; this value will ensure that the output has unit stddev if the input has unit stddev. bias-stddev Standard deviation of bias terms. default=0.0. + init-unit Defaults to false. If true, it is required that + num-filters-in equal num-filters-out and there should + exist a (height, time) offset in the model equal to (0, + 0). We will initialize the parameter matrix to be + equivalent to the identity transform. In this case, + param-stddev is ignored. Natural-gradient related options are below; you won't normally have to @@ -308,7 +314,9 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { const CuMatrixBase &in_value, const CuMatrixBase &out_deriv); - + // Function called to initialize linear_params_ if init-unit=true in the config + // line. + void InitUnit(); time_height_convolution::ConvolutionModel model_; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 27482678235..da19b477337 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -217,84 +217,10 @@ void DropoutComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); WriteBasicType(os, binary, dropout_per_frame_); WriteToken(os, binary, ""); - WriteBasicType(os, binary, test_mode_); + WriteBasicType(os, binary, test_mode_); WriteToken(os, binary, ""); } -void SumReduceComponent::Init(int32 input_dim, int32 output_dim) { - input_dim_ = input_dim; - output_dim_ = output_dim; - KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0 && - input_dim_ % output_dim_ == 0); -} - -void SumReduceComponent::InitFromConfig(ConfigLine *cfl) { - int32 input_dim = 0; - int32 output_dim = 0; - bool ok = cfl->GetValue("output-dim", &output_dim) && - cfl->GetValue("input-dim", &input_dim); - if (!ok || cfl->HasUnusedValues() || output_dim <= 0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; - Init(input_dim, output_dim); -} - - -void* SumReduceComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(out->NumRows() == in.NumRows() && in.NumCols() == input_dim_ - && out->NumCols() == output_dim_); - int32 num_blocks = input_dim_ / output_dim_; - for (int32 i = 0; i < num_blocks; i++) { - CuSubMatrix in_block(in, 0, in.NumRows(), - i * output_dim_, output_dim_); - if (i == 0) - out->CopyFromMat(in_block); - else - out->AddMat(1.0, in_block); - } - return NULL; -} - -void SumReduceComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update - CuMatrixBase *in_deriv) const { - if (!in_deriv) return; - KALDI_ASSERT(out_deriv.NumRows() == in_deriv->NumRows() && - in_deriv->NumCols() == input_dim_ && - out_deriv.NumCols() 
== output_dim_); - int32 num_blocks = input_dim_ / output_dim_; - for (int32 i = 0; i < num_blocks; i++) { - CuSubMatrix in_deriv_block(*in_deriv, 0, in_deriv->NumRows(), - i * output_dim_, output_dim_); - in_deriv_block.CopyFromMat(out_deriv); - } -} - -void SumReduceComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &input_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &output_dim_); - ExpectToken(is, binary, ""); -} - -void SumReduceComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, output_dim_); - WriteToken(os, binary, ""); -} - - void ElementwiseProductComponent::Init(int32 input_dim, int32 output_dim) { input_dim_ = input_dim; output_dim_ = output_dim; @@ -5873,5 +5799,78 @@ void BatchNormComponent::ZeroStats() { } +SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): + input_dim_(other.input_dim_), output_dim_(other.output_dim_), + scale_(other.scale_) { } + +void SumBlockComponent::InitFromConfig(ConfigLine *cfl) { + scale_ = 1.0; + bool ok = cfl->GetValue("input-dim", &input_dim_) && + cfl->GetValue("output-dim", &output_dim_); + if (!ok) + KALDI_ERR << "input-dim and output-dim must both be provided."; + if (input_dim_ <= 0 || input_dim_ % output_dim_ != 0) + KALDI_ERR << "Invalid values input-dim=" << input_dim_ + << " output-dim=" << output_dim_; + cfl->GetValue("scale", &scale_); + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); +} + +void SumBlockComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &input_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &output_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &scale_); + ExpectToken(is, binary, ""); +} + +void SumBlockComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, output_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, scale_); + WriteToken(os, binary, ""); +} + +std::string SumBlockComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << input_dim_ + << ", output-dim=" << output_dim_ + << ", scale=" << scale_; + return stream.str(); +} + +void* SumBlockComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(out->NumRows() == in.NumRows() && + out->NumCols() == output_dim_ && + in.NumCols() == input_dim_); + out->AddMatBlocks(scale_, in, kNoTrans); + return NULL; +} + +void SumBlockComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (in_deriv) { + in_deriv->AddMatBlocks(scale_, out_deriv, kNoTrans); + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index a640470098e..4af8649515f 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -424,54 
+424,6 @@ class RectifiedLinearComponent: public NonlinearComponent { RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow. }; -/** - This component is a fixed (non-trainable) nonlinearity that sums its inputs - to produce outputs. Currently the only supported configuration is that its - input-dim is interpreted as consisting of n blocks, and the output is just a - summation over the n blocks, where n = input-dim / output-dim, so for instance - output[n] = input[n] + input[block-size + n] + .... . - Later if needed we can add a configuration variable that allows you to sum - over 'interleaved' input. - */ -class SumReduceComponent: public Component { - public: - void Init(int32 input_dim, int32 output_dim); - explicit SumReduceComponent(int32 input_dim, int32 output_dim) { - Init(input_dim, output_dim); - } - virtual int32 Properties() const { - return kSimpleComponent|kLinearInInput; - } - SumReduceComponent(): input_dim_(0), output_dim_(0) { } - virtual std::string Type() const { return "SumReduceComponent"; } - virtual void InitFromConfig(ConfigLine *cfl); - virtual int32 InputDim() const { return input_dim_; } - virtual int32 OutputDim() const { return output_dim_; } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update - CuMatrixBase *in_deriv) const; - virtual Component* Copy() const { return new SumReduceComponent(input_dim_, - output_dim_); } - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - - protected: - int32 input_dim_; - int32 output_dim_; -}; - class FixedAffineComponent; class FixedScaleComponent; @@ -1119,8 +1071,10 @@ class FixedBiasComponent: public Component { KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent); }; -// NoOpComponent just duplicates its input. We don't anticipate this being used -// very often, but it may sometimes make your life easier +/** NoOpComponent just duplicates its input. We don't anticipate this being used + very often, but it may sometimes make your life easier. + The only config parameter it accepts is 'dim', e.g. 'dim=400'. +*/ class NoOpComponent: public NonlinearComponent { public: explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { } @@ -1145,6 +1099,54 @@ class NoOpComponent: public NonlinearComponent { NoOpComponent &operator = (const NoOpComponent &other); // Disallow. }; +/** SumBlockComponent sums over blocks of its input: for instance, if + you create one with the config "input-dim=400 output-dim=100", + its output will be the sum over the 4 100-dimensional blocks of + the input. + + The "scale" config parameter may be used if you want to do averaging + instead of summing, e.g. "input-dim=400 output-dim=100 scale=0.25" + will accomplish averaging. + + Accepted values on its config-file line are: + input-dim The input dimension. Required. + output-dim The block dimension. Required. Must divide input-dim. + scale A scaling factor on the output. Defaults to 1.0. 
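+
+ As an illustrative example (not taken from this patch), a config-file line
+ creating such a component could look like
+ 'component name=sum-block type=SumBlockComponent input-dim=400 output-dim=100 scale=0.25',
+ which averages the four 100-dimensional blocks of a 400-dimensional input.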
+ */ +class SumBlockComponent: public Component { + public: + explicit SumBlockComponent(const SumBlockComponent &other); + SumBlockComponent() { } + virtual std::string Type() const { return "SumBlockComponent"; } + virtual int32 Properties() const { + return kSimpleComponent|kLinearInInput|kPropagateAdds|kBackpropAdds; + } + virtual void InitFromConfig(ConfigLine *cfl); + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { return output_dim_; } + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; + virtual Component* Copy() const { return new SumBlockComponent(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, //in_value + const CuMatrixBase &, // out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + private: + int32 input_dim_; + int32 output_dim_; + BaseFloat scale_; + SumBlockComponent &operator = (const SumBlockComponent &other); // Disallow. +}; + + // ClipGradientComponent just duplicates its input, but clips gradients // during backpropagation if they cross a predetermined threshold. // This component will be used to prevent gradient explosion problem in @@ -1415,9 +1417,23 @@ class PerElementScaleComponent: public UpdatableComponent { CuVector scales_; }; +/* + PerElementOffsetComponent offsets each dimension of its input with a separate + trainable bias; it's like an affine component with fixed weight matrix which + is always equal to I. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the offsets will be read from this file ('vector' + is interpreted as an rxfilename). -// PerElementOffsetComponent offsets each dimension of its input with a separate -// trainable bias; it's like an affine component with fixed weight matrix which is always equal to I. + dim If 'vector' is not specified, you should specify the + dimension 'dim', and the offsets will be randomly initialized according + to 'param-mean' and 'param-stddev'. + param-mean=0.0 Mean of randomly initialized offset parameters. + param-stddev=0.0 Standard deviation of randomly initialized offset parameters. 
+ +*/ class PerElementOffsetComponent: public UpdatableComponent { public: virtual int32 InputDim() const { return offsets_.Dim(); } diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index e1d58b34428..a138fcacceb 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1288,7 +1288,7 @@ void ComputeExampleComputationRequestSimple( static void GenerateRandomComponentConfig(std::string *component_type, std::string *config) { - int32 n = RandInt(0, 33); + int32 n = RandInt(0, 32); BaseFloat learning_rate = 0.001 * RandInt(1, 100); std::ostringstream os; @@ -1463,14 +1463,7 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate << param_config; break; } - case 20: { - *component_type = "SumReduceComponent"; - int32 output_dim = RandInt(1, 50), group_size = RandInt(1, 15), - input_dim = output_dim * group_size; - os << "input-dim=" << input_dim << " output-dim=" << output_dim; - break; - } - case 21: { + case 20: case 21: { *component_type = "CompositeComponent"; int32 cur_dim = RandInt(20, 30), num_components = RandInt(1, 3), max_rows_process = RandInt(1, 30); @@ -1598,7 +1591,7 @@ static void GenerateRandomComponentConfig(std::string *component_type, } // I think we'll get in the habit of allocating a larger number of case // labels to the most recently added component, so it gets tested more - case 31: case 32: case 33: { + case 31: { *component_type = "BatchNormComponent"; int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); bool test_mode = (RandInt(0, 1) == 0); @@ -1608,6 +1601,16 @@ static void GenerateRandomComponentConfig(std::string *component_type, << (test_mode ? "true" : "false"); break; } + case 32: { + *component_type = "SumBlockComponent"; + BaseFloat scale = 0.5 * RandInt(1, 3); + BaseFloat output_dim = RandInt(1, 10), + input_dim = output_dim * RandInt(1, 3); + os << "input-dim=" << input_dim + << " output-dim=" << output_dim + << " scale=" << scale; + break; + } default: KALDI_ERR << "Error generating random component"; }
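For reference (this is not part of the patch): the per-row block-sum that SumBlockComponent::Propagate() delegates to AddMatBlocks() in the "*this is smaller than src" case can be sketched in plain, self-contained C++ as below. It uses std::vector rather than Kaldi's CuMatrix API, and all names in it are illustrative only.

// Illustrates the block-sum semantics used by SumBlockComponent: an input row
// of dimension input_dim is viewed as (input_dim / output_dim) consecutive
// blocks of size output_dim, which are summed into the output and scaled.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

// Sums the consecutive blocks of 'in' into 'out', scaled by 'scale'.
void SumBlocks(const std::vector<float> &in, float scale,
               std::vector<float> *out) {
  std::size_t input_dim = in.size(), output_dim = out->size();
  assert(output_dim > 0 && input_dim % output_dim == 0);
  std::fill(out->begin(), out->end(), 0.0f);
  for (std::size_t i = 0; i < input_dim; i++)
    (*out)[i % output_dim] += scale * in[i];  // element i lies in block i / output_dim.
}

int main() {
  // input-dim=8, output-dim=4, scale=0.5: averages the two 4-dim blocks.
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> out(4);
  SumBlocks(in, 0.5f, &out);
  for (float v : out) std::cout << v << " ";  // prints: 3 4 5 6
  std::cout << std::endl;
  return 0;
}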