diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index 753dfc632ba..b927cc86823 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -18,7 +18,7 @@ # # # local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp -# System tdnn7n_sp tdnn7m26j_sp +# System tdnn7n_sp tdnn7m26o_sp # WER on train_dev(tg) 12.18 11.74 # WER on train_dev(fg) 11.12 10.69 # WER on eval2000(tg) 14.9 14.6 diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index c936061de26..3a767721c6d 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -64,7 +64,10 @@ DenominatorComputation::DenominatorComputation( nnet_output.NumRows(), kUndefined, kStrideEqualNumCols); exp_nnet_output_transposed_.CopyFromMat(nnet_output, kTrans); - exp_nnet_output_transposed_.ApplyExp(); + // We limit the nnet output to the range [-30,30] before doing the exp; + // this avoids NaNs appearing in the forward-backward computation, which + // is not done in log space. 
+ exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0); } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index f2926ddc2f1..6b99a77e73b 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -200,6 +200,10 @@ void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit); +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit); void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride); void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 50dd3d1d0ca..934a860a055 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -400,6 +400,26 @@ static void _apply_exp(Real* mat, MatrixDim d) { } } +template +__global__ +static void _apply_exp_limited(Real* mat, MatrixDim d, + Real lower_limit, Real upper_limit) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + if (i < d.cols && j < d.rows) { + Real x = mat[index]; + // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that + // nan's will be set to the lower-limit. 
+ if (!(x >= lower_limit)) + x = lower_limit; + else if (x > upper_limit) + x = upper_limit; + mat[index] = exp(x); + } +} + + template __global__ static void _scale_diag_packed(Real* mat, Real value, int dim) { @@ -3734,6 +3754,11 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } @@ -4430,6 +4455,13 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + + + void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index fe706815a44..8f719a8c4a1 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -345,6 +345,14 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr, Bl, mat, d); } +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + cudaF_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride) { diff --git a/src/cudamatrix/cu-matrix-test.cc 
b/src/cudamatrix/cu-matrix-test.cc index 33db8b3e625..01030bb8353 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -194,6 +194,30 @@ static void UnitTestCuMatrixApplyExp() { } +template +static void UnitTestCuMatrixApplyExpLimited() { + int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; + Matrix H(M, N); + H.SetRandn(); + + + BaseFloat lower_limit = -0.2, upper_limit = 0.2; + + CuMatrix D(H); + + D.ApplyExpLimited(lower_limit, upper_limit); + + + H.ApplyFloor(lower_limit); + H.ApplyCeiling(upper_limit); + H.ApplyExp(); + + Matrix H2(D); + + AssertEqual(H,H2); +} + + template static void UnitTestCuMatrixSigmoid() { @@ -2895,6 +2919,7 @@ static void UnitTestCuMatrixEqualElementMask() { template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyExpSpecial(); + UnitTestCuMatrixApplyExpLimited(); UnitTextCuMatrixAddSmatMat(); UnitTextCuMatrixAddMatSmat(); UnitTextCuMatrixAddSmat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 34290561cc5..beccd9dc4a5 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2498,6 +2498,37 @@ void CuMatrixBase::ApplyExp() { } } +template +void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { + KALDI_ASSERT(upper_limit > lower_limit); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_rows = num_rows_, num_cols = num_cols_; + for (int32 r = 0; r < num_rows; r++) { + Real *row_data = this->RowData(r); + for (int32 c = 0; c < num_cols; c++) { + Real x = row_data[c]; + if (!(x >= lower_limit)) + x = lower_limit; + if (x > upper_limit) + x = upper_limit; + row_data[c] = Exp(x); + } + } + } +} + 
+ template void CuMatrixBase::ApplyExpSpecial() { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 86c50cfc485..03e69b639d3 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -399,6 +399,13 @@ class CuMatrixBase { void ApplyCeiling(Real ceiling_val); void ApplyExp(); + + /// This is equivalent to running: + /// ApplyFloor(lower_limit); + /// ApplyCeiling(upper_limit); + /// ApplyExp() + void ApplyExpLimited(Real lower_limit, Real upper_limit); + /// For each element x of the matrix, set it to /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 669e5112793..00a31fa897c 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1414,7 +1414,7 @@ void* DropoutMaskComponent::Propagate( BaseFloat dropout_proportion = dropout_proportion_; KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); - if (dropout_proportion_ == 0) { + if (dropout_proportion == 0) { out->Set(1.0); return NULL; } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f9f286aaed2..4eb078c0fcb 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3730,15 +3730,11 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // for the preconditioner actually exceeds the memory for the // parameters (by "rank"). 
update_period = 10; - BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_minibatch = 0.0; + BaseFloat num_samples_history = 2000.0, alpha = 4.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); - cfl->GetValue("max-change-per-minibatch", &max_change_per_minibatch); - if (max_change_per_minibatch != 0.0) - KALDI_WARN << "max-change-per-minibatch is now ignored, use 'max-change'"; InitLearningRatesFromConfig(cfl); std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 9d438678f5d..3929c253aab 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1446,6 +1446,19 @@ class PermuteComponent: public Component { trainable scale; it's like a linear component with a diagonal matrix. This version (and its child class NaturalGradientPerElementScaleComponent) requires the input for backprop. See also ScaleAndOffsetComponent. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the scales will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized scale parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized scale parameters; + should only be supplied if 'vector' is not supplied. */ class PerElementScaleComponent: public UpdatableComponent { public: @@ -1670,8 +1683,29 @@ class ConstantFunctionComponent: public UpdatableComponent {
+/** + NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but + it uses a natural gradient update for the per-element scales. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the scales will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. + + param-mean=1.0 Mean of randomly initialized scale parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized scale parameters; + should only be supplied if 'vector' is not supplied. + + And the natural-gradient-related configuration values: + rank=8 + update-period=10 + num-samples-history=2000.0 + alpha=4.0 +*/ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: