diff --git a/egs/cifar/v1/local/nnet3/run_cnn_1d.sh b/egs/cifar/v1/local/nnet3/run_cnn_1d.sh
new file mode 100755
index 00000000000..6baad31fcbb
--- /dev/null
+++ b/egs/cifar/v1/local/nnet3/run_cnn_1d.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+# 1d is as 1c but adding batch-norm to all convolutional layers.
+# batch-norm helps (0.78 -> 0.8).
+
+# exp/cnn1d_cifar10: num-iters=60 nj=1..2 num-params=4.3M dim=96->10 combine=-0.10->-0.08 loglike:train/valid[39,59,final]=(-0.03,-0.00,-0.00/-0.63,-0.69,-0.68) accuracy:train/valid[39,59,final]=(1.00,1.00,1.00/0.81,0.82,0.82)
+
+
+# Set -e here so that we catch if any executable fails immediately
+set -euo pipefail
+
+
+
+# training options
+stage=0
+train_stage=-10
+dataset=cifar10
+srand=0
+reporting_email=
+affix=1d
+
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=96 name=input
+  conv-relu-batchnorm-layer name=cnn1 height-in=32 height-out=32 time-offsets=-1,0,1 $common1
+  conv-relu-batchnorm-dropout-layer name=cnn2 height-in=32 height-out=16 time-offsets=-1,0,1 dropout-proportion=0.25 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=16 height-out=16 time-offsets=-1,0,1 $common2
+  conv-relu-batchnorm-dropout-layer name=cnn4 height-in=16 height-out=8 time-offsets=-1,0,1 dropout-proportion=0.25 $common2 height-subsample-out=2
+  relu-dropout-layer name=fully_connected1 input=Append(0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30) dropout-proportion=0.5 dim=512
+  output-layer name=output dim=$num_targets
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 2 ]; then
+
+  steps/nnet3/train_raw_dnn.py --stage=$train_stage \
+    --cmd="$train_cmd" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=30 \
+    --egs.frames-per-eg=1 \
+    --trainer.optimization.num-jobs-initial=1 \
+    --trainer.optimization.num-jobs-final=2 \
+    --trainer.optimization.initial-effective-lrate=0.003 \
+    --trainer.optimization.final-effective-lrate=0.0003 \
+    --trainer.optimization.minibatch-size=256,128,64 \
+    --trainer.shuffle-buffer-size=2000 \
+    --egs.dir="$egs" \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --dir=$dir || exit 1;
+fi
+
+
+exit 0;
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 4341be630aa..0ab4a5e5f63 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -37,6 +37,7 @@
         'conv-relu-renorm-layer': xlayers.XconfigConvLayer,
         'relu-conv-batchnorm-layer': xlayers.XconfigConvLayer,
         'conv-relu-batchnorm-layer': xlayers.XconfigConvLayer,
+        'conv-relu-batchnorm-dropout-layer': xlayers.XconfigConvLayer,
         'conv-relu-dropout-layer': xlayers.XconfigConvLayer,
         'relu-dropout-layer': xlayers.XconfigBasicLayer
diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc
index 85e328709e0..5e3451b073d 100644
--- a/src/chainbin/nnet3-chain-combine.cc
+++ b/src/chainbin/nnet3-chain-combine.cc
@@ -41,6 +41,8 @@ int main(int argc, char *argv[]) {
         " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n";
 
     bool binary_write = true;
+    bool batchnorm_test_mode = false,
+        dropout_test_mode = true;
     std::string use_gpu = "yes";
     NnetCombineConfig combine_config;
    chain::ChainTrainingOptions chain_config;
@@ -49,6 +51,11 @@ int main(int argc, char *argv[]) {
     po.Register("binary", &binary_write, "Write output in binary mode");
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
+    po.Register("batchnorm-test-mode", &batchnorm_test_mode,
+                "If true, set test-mode to true on any BatchNormComponents.");
+    po.Register("dropout-test-mode", &dropout_test_mode,
+                "If true, set test-mode to true on any DropoutComponents and "
+                "DropoutMaskComponents.");
 
     combine_config.Register(&po);
     chain_config.Register(&po);
@@ -77,13 +84,10 @@ int main(int argc, char *argv[]) {
     Nnet nnet;
     ReadKaldiObject(raw_nnet_rxfilename, &nnet);
 
-    // This is needed for batch-norm. We also ensure in the calling script
-    // that the freshest model comes first on the command line; this
-    // means we use the freshest batch-norm stats. (Since the batch-norm
-    // stats are not technically parameters, they are not subject to
-    // combination like the rest of the model parameters).
-    SetBatchnormTestMode(true, &nnet);
-    SetDropoutTestMode(true, &nnet);
+    if (batchnorm_test_mode)
+      SetBatchnormTestMode(true, &nnet);
+    if (dropout_test_mode)
+      SetDropoutTestMode(true, &nnet);
 
     std::vector<NnetChainExample> egs;
     egs.reserve(10000);  // reserve a lot of space to minimize the chance of
diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc
index 49827490fab..8cf25d4ad08 100644
--- a/src/chainbin/nnet3-chain-compute-prob.cc
+++ b/src/chainbin/nnet3-chain-compute-prob.cc
@@ -52,7 +52,6 @@ int main(int argc, char *argv[]) {
     po.Register("batchnorm-test-mode", &batchnorm_test_mode,
                 "If true, set test-mode to true on any BatchNormComponents.");
-
     po.Register("dropout-test-mode", &dropout_test_mode,
                 "If true, set test-mode to true on any DropoutComponents and "
                 "DropoutMaskComponents.");
diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc
index d6c376ab45e..2dd9b049d9b 100644
--- a/src/nnet3/nnet-chain-combine.cc
+++ b/src/nnet3/nnet-chain-combine.cc
@@ -19,6 +19,7 @@
 
 #include "nnet3/nnet-chain-combine.h"
 #include "nnet3/nnet-utils.h"
+#include "nnet3/nnet-chain-training.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -38,7 +39,6 @@ NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config,
     nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs),
                  NumParameters(first_nnet)),
     tot_input_weighting_(nnet_params_.NumRows()) {
-  SetDropoutProportion(0, &nnet_);
   if (combine_config_.sum_to_one_penalty != 0.0 &&
       combine_config_.enforce_sum_to_one) {
@@ -182,6 +182,18 @@ void NnetChainCombiner::Combine() {
     ComputeObjfAndDerivFromParameters(final_params, &deriv);
   }
   PrintParams(final_params);
+  if (HasBatchnorm(nnet_)) {
+    RecomputeBatchnormStats();
+  }
+}
+
+void NnetChainCombiner::RecomputeBatchnormStats() {
+  KALDI_LOG << "Recomputing batch-norm stats on nnet.";
+  NnetChainTrainingOptions train_opts;
+  train_opts.nnet_config.train = false;
+  NnetChainTrainer trainer(train_opts, den_fst_, &nnet_);
+  for (size_t i = 0; i < egs_.size(); i++)
+    trainer.Train(egs_[i]);
 }
diff --git a/src/nnet3/nnet-chain-combine.h b/src/nnet3/nnet-chain-combine.h
index 3aeb3882650..4f4ed55202d 100644
--- a/src/nnet3/nnet-chain-combine.h
+++ b/src/nnet3/nnet-chain-combine.h
@@ -194,6 +194,7 @@ class NnetChainCombiner {
   void ComputeUpdatableComponentDims();
   void FinishPreprocessingInput();
 
+  void RecomputeBatchnormStats();
 
 };
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index e955710fff6..1a6bebd0e3d 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -58,8 +58,8 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts,
 
 
 void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
-  bool need_model_derivative = true;
   const NnetTrainerOptions &nnet_config = opts_.nnet_config;
+  bool need_model_derivative = nnet_config.train;
   bool use_xent_regularization = (opts_.chain_config.xent_regularize != 0.0);
   ComputationRequest request;
   GetChainComputationRequest(*nnet_, chain_eg, need_model_derivative,
@@ -73,16 +73,21 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
   // give the inputs to the computer object.
   computer.AcceptInputs(*nnet_, chain_eg.inputs);
   computer.Run();
-
   this->ProcessOutputs(chain_eg, &computer);
-  computer.Run();
-
-  UpdateParamsWithMaxChange();
+  if (nnet_config.train) {
+    computer.Run();
+    UpdateParamsWithMaxChange();
+  } else {
+    // all parameter derivs will be zero; here we're just adding the stored stats.
+    AddNnet(*delta_nnet_, 1.0, nnet_);
+    ScaleNnet(0.0, delta_nnet_);
+  }
 }
 
 void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
                                       NnetComputer *computer) {
+  bool train = opts_.nnet_config.train;
   // normally the eg will have just one output named 'output', but
   // we don't assume this.
   std::vector<NnetChainSupervision>::const_iterator iter = eg.outputs.begin(),
      end = eg.outputs.end();
@@ -111,7 +116,7 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
     ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_,
                              sup.supervision, nnet_output,
                              &tot_objf, &tot_l2_term, &tot_weight,
-                             &nnet_output_deriv,
+                             (train ? &nnet_output_deriv : NULL),
                              (use_xent ? &xent_deriv : NULL));
 
     if (use_xent) {
@@ -126,20 +131,21 @@ void NnetChainTrainer::ProcessOutputs(const NnetChainExample &eg,
                           tot_weight, xent_objf);
     }
 
-    if (opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
+    if (train && opts_.apply_deriv_weights && sup.deriv_weights.Dim() != 0) {
       CuVector<BaseFloat> cu_deriv_weights(sup.deriv_weights);
       nnet_output_deriv.MulRowsVec(cu_deriv_weights);
       if (use_xent)
         xent_deriv.MulRowsVec(cu_deriv_weights);
     }
 
-    computer->AcceptInput(sup.name, &nnet_output_deriv);
+    if (train)
+      computer->AcceptInput(sup.name, &nnet_output_deriv);
 
     objf_info_[sup.name].UpdateStats(sup.name, opts_.nnet_config.print_interval,
                                      num_minibatches_processed_++,
                                      tot_weight, tot_objf, tot_l2_term);
 
-    if (use_xent) {
+    if (train && use_xent) {
       xent_deriv.Scale(opts_.chain_config.xent_regularize);
       computer->AcceptInput(xent_name, &xent_deriv);
     }
diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc
index ba904b1c93a..c34d2e9631e 100644
--- a/src/nnet3/nnet-combine.cc
+++ b/src/nnet3/nnet-combine.cc
@@ -41,7 +41,6 @@ NnetCombiner::NnetCombiner(const NnetCombineConfig &config,
               << " is nonzero, so setting --enforce-sum-to-one=false.";
     config_.enforce_sum_to_one = false;
   }
-  SetDropoutProportion(0, &nnet_);
   SubVector<BaseFloat> first_params(nnet_params_, 0);
   VectorizeNnet(nnet_, &first_params);
   tot_input_weighting_(0) += 1.0;
@@ -178,6 +177,19 @@ void NnetCombiner::Combine() {
     ComputeObjfAndDerivFromParameters(final_params, &deriv);
   }
   PrintParams(final_params);
+
+  if (HasBatchnorm(nnet_)) {
+    RecomputeBatchnormStats();
+  }
+}
+
+void NnetCombiner::RecomputeBatchnormStats() {
+  KALDI_LOG << "Recomputing batch-norm stats on nnet.";
+  NnetTrainerOptions train_opts;
+  train_opts.train = false;
+  NnetTrainer trainer(train_opts, &nnet_);
+  for (size_t i = 0; i < egs_.size(); i++)
+    trainer.Train(egs_[i]);
 }
diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h
index 5b60d30b8ed..fda89027cf9 100644
--- a/src/nnet3/nnet-combine.h
+++ b/src/nnet3/nnet-combine.h
@@ -240,6 +240,8 @@ class NnetCombiner {
   void ComputeUpdatableComponentDims();
   void FinishPreprocessingInput();
 
+  void RecomputeBatchnormStats();
+
 
 };
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index 2a081920738..0bcf79ce662 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -55,7 +55,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config,
 
 
 void NnetTrainer::Train(const NnetExample &eg) {
-  bool need_model_derivative = true;
+  bool need_model_derivative = config_.train;
   ComputationRequest request;
   GetComputationRequest(*nnet_, eg, need_model_derivative,
                         config_.store_component_stats,
@@ -69,9 +69,16 @@ void NnetTrainer::Train(const NnetExample &eg) {
   computer.Run();
 
   this->ProcessOutputs(eg, &computer);
-  computer.Run();
 
-  UpdateParamsWithMaxChange();
+  if (config_.train) {
+    computer.Run();
+
+    UpdateParamsWithMaxChange();
+  } else {
+    // all parameter derivs will be zero; here we're just adding the stored stats.
+    AddNnet(*delta_nnet_, 1.0, nnet_);
+    ScaleNnet(0.0, delta_nnet_);
+  }
 }
 
 void NnetTrainer::ProcessOutputs(const NnetExample &eg,
@@ -85,7 +92,7 @@ void NnetTrainer::ProcessOutputs(const NnetExample &eg,
     if (nnet_->IsOutputNode(node_index)) {
       ObjectiveType obj_type = nnet_->GetNode(node_index).u.objective_type;
       BaseFloat tot_weight, tot_objf;
-      bool supply_deriv = true;
+      bool supply_deriv = config_.train;
       ComputeObjectiveFunction(io.features, obj_type, io.name, supply_deriv,
                                computer, &tot_weight, &tot_objf);
diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h
index 64a66612368..46897b51d88 100644
--- a/src/nnet3/nnet-training.h
+++ b/src/nnet3/nnet-training.h
@@ -33,6 +33,7 @@ namespace nnet3 {
 struct NnetTrainerOptions {
   bool zero_component_stats;
   bool store_component_stats;
+  bool train;
   int32 print_interval;
   bool debug_computation;
   BaseFloat momentum;
@@ -46,6 +47,7 @@ struct NnetTrainerOptions {
   NnetTrainerOptions():
       zero_component_stats(true),
       store_component_stats(true),
+      train(true),
       print_interval(100),
       debug_computation(false),
       momentum(0.0),
@@ -76,6 +78,10 @@ struct NnetTrainerOptions {
                    "write the cached computation to");
     opts->Register("binary-write-cache", &binary_write_cache, "Write "
                    "computation cache in binary mode");
+    opts->Register("train", &train, "If true, actually do the training "
+                   "(if false, it will do only the forward propagation, "
+                   "which affects stored stats for batch-norm, among other "
+                   "things.)");
 
     // register the optimization options with the prefix "optimization".
     ParseOptions optimization_opts("optimization", opts);
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index fe9e9f91997..8950c009b91 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -469,6 +469,16 @@ void SetDropoutProportion(BaseFloat dropout_proportion,
   }
 }
 
+bool HasBatchnorm(const Nnet &nnet) {
+  for (int32 c = 0; c < nnet.NumComponents(); c++) {
+    const Component *comp = nnet.GetComponent(c);
+    const BatchNormComponent *bc =
+        dynamic_cast<const BatchNormComponent*>(comp);
+    if (bc != NULL)
+      return true;
+  }
+  return false;
+}
 
 void SetBatchnormTestMode(bool test_mode, Nnet *nnet) {
   for (int32 c = 0; c < nnet->NumComponents(); c++) {
diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h
index 6f645d89c56..0b5faadc1b8 100644
--- a/src/nnet3/nnet-utils.h
+++ b/src/nnet3/nnet-utils.h
@@ -164,6 +164,11 @@ std::string NnetInfo(const Nnet &nnet);
 /// dropout_proportion value.
 void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet);
 
+
+/// Returns true if nnet has at least one component of type
+/// BatchNormComponent.
+bool HasBatchnorm(const Nnet &nnet);
+
 /// This function affects only components of type BatchNormComponent.
 /// It sets "test mode" on such components (if you call it with test_mode =
 /// true, otherwise it would set normal mode, but this wouldn't be needed
diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc
index 7885bb70b6b..30ad338ab4b 100644
--- a/src/nnet3bin/nnet3-combine.cc
+++ b/src/nnet3bin/nnet3-combine.cc
@@ -40,11 +40,18 @@ int main(int argc, char *argv[]) {
         " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n";
 
     bool binary_write = true;
+    bool batchnorm_test_mode = false,
+        dropout_test_mode = true;
     std::string use_gpu = "yes";
     NnetCombineConfig combine_config;
 
     ParseOptions po(usage);
     po.Register("binary", &binary_write, "Write output in binary mode");
+    po.Register("batchnorm-test-mode", &batchnorm_test_mode,
+                "If true, set test-mode to true on any BatchNormComponents.");
+    po.Register("dropout-test-mode", &dropout_test_mode,
+                "If true, set test-mode to true on any DropoutComponents and "
+                "DropoutMaskComponents.");
     po.Register("use-gpu", &use_gpu,
                 "yes|no|optional|wait, only has effect if compiled with CUDA");
@@ -69,13 +76,10 @@ int main(int argc, char *argv[]) {
     Nnet nnet;
     ReadKaldiObject(nnet_rxfilename, &nnet);
 
-    // This is needed for batch-norm. We also ensure in the calling script
-    // that the freshest model comes first on the command line; this
-    // means we use the freshest batch-norm stats. (Since the batch-norm
-    // stats are not technically parameters, they are not subject to
-    // combination like the rest of the model parameters).
-    SetBatchnormTestMode(true, &nnet);
-    SetDropoutTestMode(true, &nnet);
+    if (batchnorm_test_mode)
+      SetBatchnormTestMode(true, &nnet);
+    if (dropout_test_mode)
+      SetDropoutTestMode(true, &nnet);
 
     std::vector<NnetExample> egs;
     egs.reserve(10000);  // reserve a lot of space to minimize the chance of
@@ -99,14 +103,11 @@ int main(int argc, char *argv[]) {
         ReadKaldiObject(po.GetArg(1 + n), &nnet);
         combiner.AcceptNnet(nnet);
       }
-
      combiner.Combine();
-
 #if HAVE_CUDA==1
      CuDevice::Instantiate().PrintProfile();
 #endif
-
      WriteKaldiObject(combiner.GetNnet(), nnet_wxfilename, binary_write);
    } else {
      KALDI_LOG << "Copying the single input model directly to the output, "
diff --git a/src/nnet3bin/nnet3-compute-prob.cc b/src/nnet3bin/nnet3-compute-prob.cc
index a67e76976c4..3dc4de3a662 100644
--- a/src/nnet3bin/nnet3-compute-prob.cc
+++ b/src/nnet3bin/nnet3-compute-prob.cc
@@ -51,7 +51,6 @@ int main(int argc, char *argv[]) {
     po.Register("batchnorm-test-mode", &batchnorm_test_mode,
                 "If true, set test-mode to true on any BatchNormComponents.");
-
     po.Register("dropout-test-mode", &dropout_test_mode,
                 "If true, set test-mode to true on any DropoutComponents and "
                 "DropoutMaskComponents.");
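
Note (not part of the patch): a hedged usage sketch of the combination binary after this change; the model and egs paths below are placeholders, not taken from the patch. Because --batchnorm-test-mode now defaults to false, batch-norm components stay in training mode during combination, and Combine() recomputes their stats from the combination examples with a forward-only pass (the trainer's new --train=false mode internally); dropout still defaults to test mode.

# Hypothetical invocation; argument order follows the usage string above
# (<nnet-in1> ... <nnet-inN> <examples-in> <nnet-out>), and all paths are made up.
# Relying on the new defaults (--batchnorm-test-mode=false --dropout-test-mode=true),
# batch-norm stats are recomputed inside Combine() whenever the model has batch-norm.
nnet3-combine exp/model/35.raw exp/model/36.raw exp/model/37.raw \
  ark:exp/model/egs/combine.egs exp/model/final.raw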